Status: Closed (mikegerber closed this issue 3 years ago)
Workspace: PPN768641977.zip
% dinglehopper-extract FULLTEXT/FILE_0002_FULLTEXT
[...]
Warum sollten wir nutzlos sterben? Der Kamps
kraft von Amerikanern im Felde dringen die Entente-
armeeu auf allen Kriegsschauplätzen schnell vor.
[ ... some lines later ... ]
ist nutzlos geworden. Selbst niit einer kleinen Streit-
[ ... ]
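In the XML (FILE_0002_FULLTEXT), the line in question ("ist nutzlos geworden. Selbst niit einer kleinen Streit-") is in Page1_Block9, which appears in the file after Page1_Block8, even though Block8 (VPOS 1112) contains the text that follows that line, while Block9 itself has VPOS 1073. So the ALTO block order in the file is incorrect; the extraction in dinglehopper is not at fault.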
<!--
  Page1_Block8 (VPOS 1112): nine TextLines, from
  "kraft von Amerikanern im Felde dringen die Entente-" through
  "ergreifen. Dadurch wurde unsere Niederlage sicher.".
  NOTE(review): this block is serialized before Page1_Block9 although its
  VPOS (1112) is larger than Block9's (1073), and its first word "kraft"
  appears to continue the hyphenated "Streit-" that ends Block9. The block
  order in this file therefore looks reversed. Coordinate units are not
  shown in this excerpt.
-->
<TextBlock ID="Page1_Block8" HEIGHT="363" WIDTH="820" VPOS="1112" HPOS="230" language="de">
<Shape>
<Polygon POINTS="245,1123 1062,1123 1062,1478 245,1478 245,1123"/>
</Shape>
<TextLine HEIGHT="45" WIDTH="803" VPOS="1118" HPOS="238">
<String STYLEREFS="font0" WC="0.7779999971" CONTENT="kraft" HEIGHT="33" WIDTH="65" VPOS="1128" HPOS="238"/>
<SP WIDTH="11" VPOS="1130" HPOS="304"/>
<String STYLEREFS="font0" WC="0.5199999809" CONTENT="von" HEIGHT="23" WIDTH="52" VPOS="1132" HPOS="316"/>
<SP WIDTH="10" VPOS="1126" HPOS="369"/>
<String STYLEREFS="font0" WC="0.6700000167" CONTENT="Amerikanern" HEIGHT="31" WIDTH="186" VPOS="1124" HPOS="380"/>
<SP WIDTH="10" VPOS="1124" HPOS="567"/>
<String STYLEREFS="font0" WC="0.6399999857" CONTENT="im" HEIGHT="30" WIDTH="38" VPOS="1123" HPOS="578"/>
<SP WIDTH="17" VPOS="1124" HPOS="617"/>
<String STYLEREFS="font0" WC="0.7480000257" CONTENT="Felde" HEIGHT="36" WIDTH="80" VPOS="1123" HPOS="635"/>
<SP WIDTH="12" VPOS="1124" HPOS="716"/>
<String STYLEREFS="font0" WC="0.7228571177" CONTENT="dringen" HEIGHT="35" WIDTH="111" VPOS="1122" HPOS="729"/>
<SP WIDTH="16" VPOS="1121" HPOS="841"/>
<String STYLEREFS="font0" WC="0.8466666937" CONTENT="die" HEIGHT="29" WIDTH="41" VPOS="1120" HPOS="858"/>
<SP WIDTH="16" VPOS="1119" HPOS="900"/>
<String STYLEREFS="font0" WC="0.7599999905" CONTENT="Entente-" HEIGHT="30" WIDTH="124" VPOS="1118" HPOS="917"/>
</TextLine>
<TextLine HEIGHT="43" WIDTH="696" VPOS="1158" HPOS="239">
<String STYLEREFS="font0" WC="0.7933333516" CONTENT="armeeu" HEIGHT="21" WIDTH="106" VPOS="1172" HPOS="239"/>
<SP WIDTH="13" VPOS="1172" HPOS="346"/>
<String STYLEREFS="font0" WC="0.603333354" CONTENT="auf" HEIGHT="34" WIDTH="45" VPOS="1164" HPOS="360"/>
<SP WIDTH="12" VPOS="1164" HPOS="406"/>
<String STYLEREFS="font0" WC="0.7300000191" CONTENT="allen" HEIGHT="29" WIDTH="69" VPOS="1163" HPOS="419"/>
<SP WIDTH="10" VPOS="1163" HPOS="489"/>
<String STYLEREFS="font0" WC="0.6894444227" CONTENT="Kriegsschauplätzen" HEIGHT="39" WIDTH="264" VPOS="1160" HPOS="500"/>
<SP WIDTH="11" VPOS="1160" HPOS="765"/>
<String STYLEREFS="font0" WC="0.8557142615" CONTENT="schnell" HEIGHT="36" WIDTH="90" VPOS="1159" HPOS="777"/>
<SP WIDTH="11" VPOS="1159" HPOS="868"/>
<String STYLEREFS="font0" WC="0.7149999738" CONTENT="vor." HEIGHT="24" WIDTH="55" VPOS="1163" HPOS="880"/>
</TextLine>
<TextLine HEIGHT="45" WIDTH="749" VPOS="1194" HPOS="292">
<String STYLEREFS="font0" WC="0.7450000048" CONTENT="Wenn" HEIGHT="29" WIDTH="87" VPOS="1202" HPOS="292"/>
<SP WIDTH="24" VPOS="1208" HPOS="380"/>
<String STYLEREFS="font0" WC="0.7466666698" CONTENT="wir" HEIGHT="30" WIDTH="48" VPOS="1200" HPOS="404"/>
<SP WIDTH="29" VPOS="1202" HPOS="453"/>
<String STYLEREFS="font0" WC="0.7966666818" CONTENT="bis" HEIGHT="30" WIDTH="40" VPOS="1200" HPOS="483"/>
<SP WIDTH="28" VPOS="1204" HPOS="524"/>
<String STYLEREFS="font0" WC="0.9275000095" CONTENT="nächstes" HEIGHT="36" WIDTH="111" VPOS="1200" HPOS="553"/>
<SP WIDTH="34" VPOS="1201" HPOS="665"/>
<String STYLEREFS="font0" WC="0.8349999785" CONTENT="Jahr" HEIGHT="35" WIDTH="71" VPOS="1200" HPOS="700"/>
<SP WIDTH="30" VPOS="1198" HPOS="771"/>
<String STYLEREFS="font0" WC="0.9700000286" CONTENT="so" HEIGHT="34" WIDTH="26" VPOS="1198" HPOS="801"/>
<SP WIDTH="29" VPOS="1202" HPOS="828"/>
<String STYLEREFS="font0" WC="0.8766666651" CONTENT="weitergehen," HEIGHT="37" WIDTH="184" VPOS="1194" HPOS="857"/>
</TextLine>
<TextLine HEIGHT="44" WIDTH="802" VPOS="1233" HPOS="239">
<String STYLEREFS="font0" WC="0.7883333564" CONTENT="werden" HEIGHT="29" WIDTH="106" VPOS="1241" HPOS="239"/>
<SP WIDTH="13" VPOS="1245" HPOS="346"/>
<String STYLEREFS="font0" WC="0.6633333564" CONTENT="wir" HEIGHT="28" WIDTH="46" VPOS="1240" HPOS="360"/>
<SP WIDTH="13" VPOS="1247" HPOS="407"/>
<String STYLEREFS="font0" WC="0.8209090829" CONTENT="ausgerottet" HEIGHT="33" WIDTH="162" VPOS="1241" HPOS="421"/>
<SP WIDTH="12" VPOS="1242" HPOS="584"/>
<String STYLEREFS="font0" WC="0.6566666961" CONTENT="werden" HEIGHT="30" WIDTH="105" VPOS="1238" HPOS="597"/>
<SP WIDTH="17" VPOS="1244" HPOS="703"/>
<String STYLEREFS="font0" WC="0.6933333278" CONTENT="von" HEIGHT="22" WIDTH="53" VPOS="1244" HPOS="721"/>
<SP WIDTH="17" VPOS="1238" HPOS="775"/>
<String STYLEREFS="font0" WC="0.8633333445" CONTENT="der" HEIGHT="29" WIDTH="46" VPOS="1237" HPOS="793"/>
<SP WIDTH="15" VPOS="1243" HPOS="840"/>
<String STYLEREFS="font0" WC="0.7618181705" CONTENT="ungeheueren" HEIGHT="38" WIDTH="185" VPOS="1233" HPOS="856"/>
</TextLine>
<TextLine HEIGHT="45" WIDTH="804" VPOS="1270" HPOS="238">
<String STYLEREFS="font0" WC="0.6616666913" CONTENT="Anzahl" HEIGHT="36" WIDTH="102" VPOS="1278" HPOS="238"/>
<SP WIDTH="24" VPOS="1278" HPOS="340"/>
<String STYLEREFS="font0" WC="0.7990909219" CONTENT="Amerikaner," HEIGHT="36" WIDTH="177" VPOS="1276" HPOS="364"/>
<SP WIDTH="26" VPOS="1278" HPOS="541"/>
<String STYLEREFS="font0" WC="0.8633333445" CONTENT="die" HEIGHT="30" WIDTH="42" VPOS="1276" HPOS="567"/>
<SP WIDTH="25" VPOS="1278" HPOS="609"/>
<String STYLEREFS="font0" WC="0.6575000286" CONTENT="dann" HEIGHT="28" WIDTH="71" VPOS="1277" HPOS="634"/>
<SP WIDTH="18" VPOS="1284" HPOS="705"/>
<String STYLEREFS="font0" WC="0.3700000048" CONTENT="am" HEIGHT="20" WIDTH="45" VPOS="1285" HPOS="723"/>
<SP WIDTH="21" VPOS="1276" HPOS="768"/>
<String STYLEREFS="font0" WC="0.7083333135" CONTENT="Kampfe" HEIGHT="37" WIDTH="116" VPOS="1273" HPOS="789"/>
<SP WIDTH="23" VPOS="1273" HPOS="905"/>
<String STYLEREFS="font0" WC="0.928888917" CONTENT="beteiligt" HEIGHT="37" WIDTH="114" VPOS="1270" HPOS="928"/>
</TextLine>
<TextLine HEIGHT="36" WIDTH="139" VPOS="1316" HPOS="240">
<String STYLEREFS="font0" WC="0.8774999976" CONTENT="sein" HEIGHT="35" WIDTH="52" VPOS="1317" HPOS="240"/>
<SP WIDTH="12" VPOS="1323" HPOS="293"/>
<String STYLEREFS="font0" WC="0.9179999828" CONTENT="wird." HEIGHT="30" WIDTH="73" VPOS="1316" HPOS="306"/>
</TextLine>
<TextLine HEIGHT="45" WIDTH="749" VPOS="1347" HPOS="294">
<String STYLEREFS="font0" WC="0.7583333254" CONTENT="Unsere" HEIGHT="35" WIDTH="95" VPOS="1355" HPOS="294"/>
<SP WIDTH="32" VPOS="1354" HPOS="390"/>
<String STYLEREFS="font0" WC="0.9116666913" CONTENT="Führer" HEIGHT="36" WIDTH="103" VPOS="1353" HPOS="422"/>
<SP WIDTH="37" VPOS="1359" HPOS="526"/>
<String STYLEREFS="font0" WC="0.7799999714" CONTENT="warer" HEIGHT="25" WIDTH="85" VPOS="1358" HPOS="564"/>
<SP WIDTH="36" VPOS="1359" HPOS="650"/>
<String STYLEREFS="font0" WC="0.7419999838" CONTENT="wahnsinnig" HEIGHT="37" WIDTH="160" VPOS="1351" HPOS="687"/>
<SP WIDTH="38" VPOS="1359" HPOS="848"/>
<String STYLEREFS="font0" WC="0.7720000148" CONTENT="genug" HEIGHT="29" WIDTH="84" VPOS="1357" HPOS="887"/>
<SP WIDTH="31" VPOS="1349" HPOS="972"/>
<String STYLEREFS="font0" WC="0.8266666532" CONTENT="die" HEIGHT="30" WIDTH="39" VPOS="1347" HPOS="1004"/>
</TextLine>
<TextLine HEIGHT="44" WIDTH="803" VPOS="1386" HPOS="239">
<String STYLEREFS="font0" WC="0.9200000167" CONTENT="Amerikaner" HEIGHT="30" WIDTH="168" VPOS="1393" HPOS="239"/>
<SP WIDTH="11" VPOS="1394" HPOS="408"/>
<String STYLEREFS="font0" WC="0.8274999857" CONTENT="dazu" HEIGHT="34" WIDTH="65" VPOS="1393" HPOS="420"/>
<SP WIDTH="12" VPOS="1401" HPOS="486"/>
<String STYLEREFS="font0" WC="0.7200000286" CONTENT="zu" HEIGHT="27" WIDTH="31" VPOS="1400" HPOS="499"/>
<SP WIDTH="12" VPOS="1394" HPOS="531"/>
<String STYLEREFS="font0" WC="0.791428566" CONTENT="treiben" HEIGHT="30" WIDTH="100" VPOS="1391" HPOS="544"/>
<SP WIDTH="12" VPOS="1392" HPOS="645"/>
<String STYLEREFS="font0" WC="0.8633333445" CONTENT="die" HEIGHT="29" WIDTH="39" VPOS="1391" HPOS="658"/>
<SP WIDTH="13" VPOS="1391" HPOS="698"/>
<String STYLEREFS="font0" WC="0.7516666651" CONTENT="Waffen" HEIGHT="36" WIDTH="107" VPOS="1389" HPOS="712"/>
<SP WIDTH="18" VPOS="1398" HPOS="820"/>
<String STYLEREFS="font0" WC="0.6399999857" CONTENT="gegen" HEIGHT="28" WIDTH="82" VPOS="1396" HPOS="839"/>
<SP WIDTH="18" VPOS="1396" HPOS="922"/>
<String STYLEREFS="font0" WC="0.7166666389" CONTENT="uns" HEIGHT="25" WIDTH="52" VPOS="1391" HPOS="941"/>
<SP WIDTH="18" VPOS="1391" HPOS="994"/>
<String STYLEREFS="font0" WC="0.6800000072" CONTENT="zu" HEIGHT="26" WIDTH="29" VPOS="1395" HPOS="1013"/>
</TextLine>
<TextLine HEIGHT="44" WIDTH="770" VPOS="1425" HPOS="239">
<String STYLEREFS="font0" WC="0.8370000124" CONTENT="ergreifen." HEIGHT="37" WIDTH="138" VPOS="1432" HPOS="239"/>
<SP WIDTH="35" VPOS="1432" HPOS="378"/>
<String STYLEREFS="font0" WC="0.862857163" CONTENT="Dadurch" HEIGHT="36" WIDTH="121" VPOS="1430" HPOS="414"/>
<SP WIDTH="13" VPOS="1430" HPOS="536"/>
<String STYLEREFS="font0" WC="0.7160000205" CONTENT="wurde" HEIGHT="29" WIDTH="91" VPOS="1430" HPOS="550"/>
<SP WIDTH="9" VPOS="1438" HPOS="642"/>
<String STYLEREFS="font0" WC="0.8866666555" CONTENT="unsere" HEIGHT="35" WIDTH="93" VPOS="1428" HPOS="652"/>
<SP WIDTH="10" VPOS="1428" HPOS="746"/>
<String STYLEREFS="font0" WC="0.6050000191" CONTENT="Niederlage" HEIGHT="38" WIDTH="157" VPOS="1426" HPOS="757"/>
<SP WIDTH="12" VPOS="1426" HPOS="915"/>
<String STYLEREFS="font0" WC="0.8571428657" CONTENT="sicher." HEIGHT="35" WIDTH="81" VPOS="1425" HPOS="928"/>
</TextLine>
</TextBlock>
<!--
  Page1_Block9 (VPOS 1073): a single TextLine,
  "ist nutzlos geworden. Selbst niit einer kleinen Streit-".
  NOTE(review): this block follows Page1_Block8 in the file although its
  VPOS (1073) is smaller than Block8's (1112). The trailing "Streit-"
  appears to be continued by "kraft" at the start of Block8; compare the
  Calamari output quoted at the end of this report, which reads
  "Streit- kraft".
-->
<TextBlock ID="Page1_Block9" HEIGHT="57" WIDTH="813" VPOS="1073" HPOS="231" language="de">
<Shape>
<Polygon POINTS="247,1084 1059,1084 1059,1133 247,1133 247,1084"/>
</Shape>
<TextLine HEIGHT="45" WIDTH="801" VPOS="1079" HPOS="237">
<String STYLEREFS="font0" WC="0.8066666722" CONTENT="ist" HEIGHT="35" WIDTH="31" VPOS="1089" HPOS="237"/>
<SP WIDTH="11" VPOS="1092" HPOS="268"/>
<String STYLEREFS="font0" WC="0.6285714507" CONTENT="nutzlos" HEIGHT="35" WIDTH="97" VPOS="1088" HPOS="279"/>
<SP WIDTH="12" VPOS="1091" HPOS="376"/>
<String STYLEREFS="font0" WC="0.6655555367" CONTENT="geworden." HEIGHT="36" WIDTH="149" VPOS="1086" HPOS="388"/>
<SP WIDTH="47" VPOS="1087" HPOS="537"/>
<String STYLEREFS="font0" WC="0.853333354" CONTENT="Selbst" HEIGHT="36" WIDTH="89" VPOS="1085" HPOS="584"/>
<SP WIDTH="13" VPOS="1088" HPOS="673"/>
<String STYLEREFS="font0" WC="0.9399999976" CONTENT="niit" HEIGHT="30" WIDTH="48" VPOS="1084" HPOS="686"/>
<SP WIDTH="13" VPOS="1086" HPOS="734"/>
<String STYLEREFS="font0" WC="0.7979999781" CONTENT="einer" HEIGHT="30" WIDTH="73" VPOS="1083" HPOS="747"/>
<SP WIDTH="9" VPOS="1083" HPOS="820"/>
<String STYLEREFS="font0" WC="0.5771428347" CONTENT="kleinen" HEIGHT="31" WIDTH="100" VPOS="1080" HPOS="829"/>
<SP WIDTH="10" VPOS="1080" HPOS="929"/>
<String STYLEREFS="font0" WC="0.687142849" CONTENT="Streit-" HEIGHT="30" WIDTH="99" VPOS="1079" HPOS="939"/>
</TextLine>
</TextBlock>
For comparison, the extracted text from the Calamari OCR:
Warum ſollten wir nutzlos ſterben? Der Kampf iſt nutzlos geworden. Selbſt mit einer kleinen Streit- kraft von Amerikanern im Felde dringen die Entente- armeen auf allen Kriegsſchauplätzen ſchnell vor.