Open stweil opened 10 months ago
It would be interesting to see the files of OCR-D-SEG-LINE-RESEG-DEWARP.
Here is OCR-D-SEG-LINE-RESEG-DEWARP/OCR-D-SEG-LINE-RESEG-DEWARP_0001.xml:
<?xml version="1.0" encoding="UTF-8"?>
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-SEG-LINE-RESEG-DEWARP_0001">
<pc:Metadata>
<pc:Creator>OCR-D/core 2.49.0</pc:Creator>
<pc:Created>2024-01-11T05:41:57.730360</pc:Created>
<pc:LastChange>2024-01-11T05:41:57.730360</pc:LastChange>
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/binarization" value="ocrd-cis-ocropy-binarize">
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
<pc:Label value="ocropy" type="method"/>
<pc:Label value="0.5" type="threshold"/>
<pc:Label value="False" type="grayscale"/>
<pc:Label value="0.0" type="maxskew"/>
<pc:Label value="0" type="noise_maxsize"/>
<pc:Label value="0" type="dpi"/>
<pc:Label value="page" type="level-of-operation"/>
</pc:Labels>
<pc:Labels externalModel="ocrd-tool" externalId="version">
<pc:Label value="0.1.5" type="ocrd-cis-ocropy-binarize"/>
<pc:Label value="2.49.0" type="ocrd/core"/>
</pc:Labels>
</pc:MetadataItem>
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/cropping" value="ocrd-tesserocr-crop">
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
<pc:Label value="0" type="dpi"/>
<pc:Label value="4" type="padding"/>
</pc:Labels>
<pc:Labels externalModel="ocrd-tool" externalId="version">
<pc:Label value="0.17.0 (tesseract 5.3.0-46-g1569)" type="ocrd-tesserocr-crop"/>
<pc:Label value="2.49.0" type="ocrd/core"/>
</pc:Labels>
</pc:MetadataItem>
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/binarization" value="ocrd-skimage-binarize">
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
<pc:Label value="li" type="method"/>
<pc:Label value="page" type="level-of-operation"/>
<pc:Label value="0" type="dpi"/>
<pc:Label value="0" type="window_size"/>
<pc:Label value="0.34" type="k"/>
</pc:Labels>
<pc:Labels externalModel="ocrd-tool" externalId="version">
<pc:Label value="0.1.7" type="ocrd-skimage-binarize"/>
<pc:Label value="2.49.0" type="ocrd/core"/>
</pc:Labels>
</pc:MetadataItem>
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/despeckling" value="ocrd-skimage-denoise">
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
<pc:Label value="page" type="level-of-operation"/>
<pc:Label value="0" type="dpi"/>
<pc:Label value="0.0" type="protect"/>
<pc:Label value="1.0" type="maxsize"/>
</pc:Labels>
<pc:Labels externalModel="ocrd-tool" externalId="version">
<pc:Label value="0.1.7" type="ocrd-skimage-denoise"/>
<pc:Label value="2.49.0" type="ocrd/core"/>
</pc:Labels>
</pc:MetadataItem>
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/deskewing" value="ocrd-tesserocr-deskew">
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
<pc:Label value="page" type="operation_level"/>
<pc:Label value="0" type="dpi"/>
<pc:Label value="1.5" type="min_orientation_confidence"/>
</pc:Labels>
<pc:Labels externalModel="ocrd-tool" externalId="version">
<pc:Label value="0.17.0 (tesseract 5.3.0-46-g1569)" type="ocrd-tesserocr-deskew"/>
<pc:Label value="2.49.0" type="ocrd/core"/>
</pc:Labels>
</pc:MetadataItem>
<pc:MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-cis-ocropy-segment">
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
<pc:Label value="0" type="dpi"/>
<pc:Label value="region" type="level-of-operation"/>
<pc:Label value="20" type="maxcolseps"/>
<pc:Label value="20" type="maxseps"/>
<pc:Label value="10" type="maximages"/>
<pc:Label value="4" type="csminheight"/>
<pc:Label value="10" type="hlminwidth"/>
<pc:Label value="0.01" type="gap_height"/>
<pc:Label value="1.5" type="gap_width"/>
<pc:Label value="True" type="overwrite_order"/>
<pc:Label value="True" type="overwrite_separators"/>
<pc:Label value="True" type="overwrite_regions"/>
<pc:Label value="True" type="overwrite_lines"/>
<pc:Label value="2.4" type="spread"/>
</pc:Labels>
<pc:Labels externalModel="ocrd-tool" externalId="version">
<pc:Label value="0.1.5" type="ocrd-cis-ocropy-segment"/>
<pc:Label value="2.49.0" type="ocrd/core"/>
</pc:Labels>
</pc:MetadataItem>
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/dewarping" value="ocrd-cis-ocropy-dewarp">
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
<pc:Label value="0" type="dpi"/>
<pc:Label value="4.0" type="range"/>
<pc:Label value="1.0" type="smoothness"/>
<pc:Label value="0.05" type="max_neighbour"/>
</pc:Labels>
<pc:Labels externalModel="ocrd-tool" externalId="version">
<pc:Label value="0.1.5" type="ocrd-cis-ocropy-dewarp"/>
<pc:Label value="2.49.0" type="ocrd/core"/>
</pc:Labels>
</pc:MetadataItem>
</pc:Metadata>
<pc:Page imageFilename="OCR-D-IMG/OCR-D-IMG_0001.tif" imageWidth="1296" imageHeight="1855" orientation="0.124143090435354" readingDirection="left-to-right" textLineOrder="top-to-bottom">
<pc:AlternativeImage filename="OCR-D-BIN/OCR-D-BIN_0001.IMG-BIN.png" comments=",binarized"/>
<pc:AlternativeImage filename="OCR-D-CROP/OCR-D-CROP_0001.IMG-CROP.png" comments=",binarized,cropped"/>
<pc:AlternativeImage filename="OCR-D-BIN2/OCR-D-BIN2_0001.IMG-BIN.png" comments=",cropped,binarized"/>
<pc:AlternativeImage filename="OCR-D-BIN-DENOISE/OCR-D-BIN-DENOISE_0001.IMG-DEN.png" comments=",cropped,binarized,despeckled"/>
<pc:AlternativeImage filename="OCR-D-BIN-DENOISE-DESKEW/OCR-D-BIN-DENOISE-DESKEW_0001.IMG-DESKEW.png" comments=",cropped,binarized,despeckled,deskewed"/>
<pc:Border>
<pc:Coords points="17,140 1296,140 1296,1805 17,1805"/>
</pc:Border>
</pc:Page>
</pc:PcGts>
The QuiVer benchmark workflow selected_pages_ocr uses a process which binarizes twice. That gives an image which is too light for good OCR results (some characters are even missing completely). Nevertheless most of the text is still readable, so there should be some OCR result.
All data is now available online.
It also includes the generated page images, for example page 1 (binarized twice, denoised, deskewed).
There are no TextLines to recognize text from, so this is expected.
(I'm going on vacation in 2 hours so I'm not checking where the segmentation step is missing/going wrong, but I can check when I'm back)
Commit 3b32589fa0598212eb6864bd112edc4d8a79c86d removed a parameter for cis-ocropy-segment, so that processor no longer produces the required text lines. cc @mweidling.
If that parameter is added again, some tests work fine, but others fail with a runtime error in cis-ocropy-segment. See related issue for ocrd_cis.
Meanwhile I restored the line segmentation for the workflow and got OCR results at least for the tests where the segmentation process did not crash (see https://github.com/cisocrgroup/ocrd_cis/issues/94). It looks like the segmentation of a single newspaper page takes several hours (the first one is now running for 252 minutes, see https://github.com/cisocrgroup/ocrd_cis/issues/98). I am afraid that the whole workflow cannot be used in the benchmark tests because of that.
The workflow selected_pages_ocr uses more than 118 GiB of RAM while running OCR with calamari-recognize for newspaper pages. A server with 128 GiB RAM starts swapping and gets nearly unusable.
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
3023389 stweil 20 0 6648 2132 1696 S 6.2 0.0 0:00.86 bash
3023393 stweil 20 0 207.9g 118.8g 44280 S 6.2 94.5 16:10.67 ocrd-calamari-r
Commit 3b32589 removed a parameter for cis-ocropy-segment, so that processor no longer produces the required text lines. cc @mweidling.
That change is faulty btw: the default is level-of-operation=region (for historic reasons), and since no prior segmentation happened, nothing will happen.
The workflow selected_pages_ocr uses more than 118 GiB of RAM while running OCR with calamari-recognize for newspaper pages. A server with 128 GiB RAM starts swapping and gets nearly unusable.
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
3023389 stweil 20 0 6648 2132 1696 S 6.2 0.0 0:00.86 bash
3023393 stweil 20 0 207.9g 118.8g 44280 S 6.2 94.5 16:10.67 ocrd-calamari-r
On which input data is that specifically? Your upload seems not to be up-to-date.
I'd gladly reproduce and debug if I had the workspace including the segmentation used. The "configuration" used is workflows/ocrd_workflows/selected_pages_ocr.txt, I take it?
That's right, selected_pages_ocr.txt is the workflow file.
It would help if I had a workspace up to ocrd-calamari-recognize's input fileGrp (OCR-D-SEG-LINE-RESEG-DEWARP?), so I can easily reproduce this.
From the looks of it, @bertsky seems to be right (above) and the workflow still doesn't produce line segmentation (only region segmentation), so this behaviour would be even more curious.
@stweil didn't we already establish (in the OCR-D Forum) that the version of ocrd_all used by Quiver at the time was hopelessly outdated? But I agree we should get to the bottom of this – with or without line segments, ocrd-calamari-recognize should not be allowed (or motivated) to allocate large amounts of memory.
But I agree we should get to the bottom of this – with or without line segments, ocrd-calamari-recognize should not be allowed (or motivated) to allocate large amounts of memory.
Yep. The way it works (line by line processing) it shouldn't happen, but a. I didn't test many newspaper pages myself and did that on a host with a lot of memory b. wouldn't be the first time to see a memory leak with TensorFlow.
(Should probably run processors with ulimit or in a cgroup)
(Should probably run processors with ulimit or in a cgroup)
Agreed! Could also be easily done in ocrd_all Docker images. Docker itself offers options like --memory 2GB and --ulimit rss=2000000:4000000, but we could also set something in the image's /etc/profile.d ...
(Should probably run processors with ulimit or in a cgroup)
Agreed! Could also be easily done in ocrd_all Docker images. Docker itself offers options like --memory 2GB and --ulimit rss=2000000:4000000, but we could also set something in the image's /etc/profile.d ...
I have thoughts about this (for example, I don't think profile.d would work here), should we open an issue in ocrd_all then? Have to look into the "slim image" efforts anyway.
version of ocrd_all used by Quiver at the time was hopelessly outdated
It is still outdated, see issue #23. And I don't know whether there are plans and resources to change that.
I have thoughts about this (for example, I don't think profile.d would work here), should we open an issue in ocrd_all then?
@mikegerber I added it to https://github.com/OCR-D/ocrd_all/issues/280 – please add your ideas there.
Because I didn't have the workspace to debug the memory problem involving ocrd-calamari-recognize, I tried to run the selected_page_ocr workflow on reichsanzeiger_random_selected_pages_ocr (removed all filegroups except OCR-D-IMG and OCR-D-GT-SEG-LINE to start with) and encountered a different problem (using latest ocrd/all:maximum image):
15:51:21.928 INFO ocrd.task_sequence.run_tasks - Start processing task 'skimage-denoise -I OCR-D-BIN2 -O OCR-D-BIN-DENOISE -p '{"level-of-operation": "page", "dpi": 0, "protect": 0.0, "maxsize": 1.0}''
15:51:24.932 INFO processor.SkimageDenoise - INPUT FILE 0 / P_1879_45_0344
15:51:31.599 ERROR ocrd.processor.helpers.run_processor - Failure in processor 'ocrd-skimage-denoise'
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/ocrd/processor/helpers.py", line 130, in run_processor
processor.process()
File "/build/ocrd_wrap/ocrd_wrap/skimage_denoise.py", line 75, in process
page_image, page_coords, page_image_info = self.workspace.image_from_page(
File "/usr/local/lib/python3.8/site-packages/ocrd/workspace.py", line 781, in image_from_page
raise Exception('Found no AlternativeImage that satisfies all requirements ' +
Exception: Found no AlternativeImage that satisfies all requirements selector="binarized" in page "P_1879_45_0344"
Workspace at this point - if someone wants to have a look: https://qurator-data.de/~mike.gerber/2024-02-quiver-benchmarks-issue-22/reichsanzeiger_random_selected_pages_ocr.zip (includes an ocrd.log)
At this point, I am not willing to look into this specific ocrd-calamari-recognize memory issue further, because I can't reproduce anything properly - it already involved guessing which original workspace it could have been and trying to run 7 processors. I am willing to look into it further, if I get the workspace in the state before ocrd-calamari-recognize ran, including OCR-D-SEG-LINE-RESEG-DEWARP.
I'll test with some other segmentation in https://github.com/OCR-D/ocrd_calamari/issues/110, just to make sure that there is no general issue.
The QuiVer benchmark workflow selected_pages_ocr uses a process which binarizes twice. That gives an image which is too light for good OCR results (some characters are even missing completely). Nevertheless most of the text is still readable, so there should be some OCR result.
I am not sure that the images are binarized twice. It runs the binarization twice, yes, but the second binarization step may just use the original image but cropped, via AlternativeImage.
@kba @bertsky Is this correct? Is there a way to verify with the log? (In the ZIP in the comment above this)
@mikegerber exactly. All binarization processors filter out already-binarized images on the input side (via feature_filter='binarized').
It's not a useful step IMHO, but it cannot hurt either.
The log would only detail this if you were to enable debug loggers for ocrd.workspace.
The related workflows all end with CER / WER 1.0, so no text is recognized by Calamari.
A manual run for a single GT terminates in less than 1 second without error message, but also without a usable result: