OCR-D / ocrd-website

Creative Commons Attribution 4.0 International
24 stars 7 forks source link

Workflow documentation, AlternativeImage #174

Closed jbarth-ubhd closed 1 year ago

jbarth-ubhd commented 3 years ago

I have a small Perl script here that generates workflow variants according to the current documentation (Steps 0..14). There is no for(...) loop for steps where only one processor is recommended.

@bertsky, could you have a look at this, especially with regard to the rule "You cannot use clipping after any processor that already adds derived images (AlternativeImage) on the same hierarchy level"?

#!/usr/bin/perl
use strict;

# Merge flag overrides into a serialized flag string.
#
# Arguments:
#   $s         - existing flags as whitespace-separated "key=value" tokens
#                (may be the empty string)
#   remaining  - key/value pairs that are added to, or override, the flags
#                parsed from $s
#
# Returns the merged flags as "key=value" tokens joined by single spaces.
# The tokens are sorted by key: Perl's hash iteration order is not stable
# between runs, and the flag string is printed next to every workflow, so an
# unsorted result would make the script's output irreproducible.
# Returns the empty string when no flags are set.
sub setFlags {
    my ( $s, @overrides ) = @_;

    my %store;

    # Parse the existing flags; empty tokens (e.g. from leading whitespace)
    # are skipped.
    for my $okv ( grep { length } split /\s+/, $s ) {
        my ( $ok, $ov ) = split /=/, $okv, 2;
        $store{$ok} = $ov;
    }

    # Apply the overrides pairwise; later values win over earlier ones.
    while ( my ( $ok, $ov ) = splice @overrides, 0, 2 ) {
        $store{$ok} = $ov;
    }

    return join ' ', map { "$_=$store{$_}" } sort keys %store;
}

# Step 0: Image Enhancement (Page Level, optional).
# No processor is emitted for this step; the workflow text and the flags are
# handed on unchanged to step 1.
sub step0 {
    my ( $workflow, $flags ) = @_;
    return step1( $workflow, $flags );
}

# Step 1: Binarization (Page Level).
# Many of the following processors require binarized images, so a single
# binarization command is always appended before continuing with step 2.
sub step1 {
    my ( $workflow, $flags ) = @_;

    my $binarize = "ocrd-olena-binarize -I OCR-D-aaa -O OCR-D-BIN\n"; # TODO wolf

    return step2( $workflow . $binarize, $flags );
}

# Step 2: Cropping (Page Level).
# Strongly recommended if the images are not cropped already.
# Continues with step 3 once per cropping processor, so every alternative
# yields its own set of workflow variants.
sub step2 {
    my ( $workflow, $flags ) = @_;

    my @croppers = (
        "ocrd-anybaseocr-crop -I OCR-D-BIN -O OCR-D-CROP",
        "ocrd-tesserocr-crop -I OCR-D-BIN -O OCR-D-CROP"
    );

    for my $cropper (@croppers) {
        my $extended = $workflow . "$cropper\n";
        step3( $extended, $flags );
    }
}

# Step 3: Binarization (Page Level).
# For better results the cropped images can be binarized again here, or
# later on region level. This script always binarizes again at this point.
sub step3 {
    my ( $workflow, $flags ) = @_;

    my $rebinarize =
      "ocrd-olena-binarize -I OCR-D-CROP -O OCR-D-BIN2\n";    # TODO wolf

    return step4( $workflow . $rebinarize, $flags );
}

# Step 4: Denoising (Page Level).
# May not be necessary for all prints and depends heavily on the selected
# binarization algorithm. Continues with step 5 once per denoising processor.
sub step4 {
    my ( $workflow, $flags ) = @_;

    my @denoisers = (
        "ocrd-cis-ocropy-denoise -I OCR-D-BIN2 -O OCR-D-DENOISE",
        "ocrd-skimage-denoise -I OCR-D-BIN2 -O OCR-D-DENOISE"
    );

    for my $denoiser (@denoisers) {
        my $extended = $workflow . "$denoiser\n";
        step5( $extended, $flags );
    }
}

# Step 5: Deskewing (Page Level).
# The input images have to be binarized for this module to work.
# Appends the single page-level deskewing command and continues with step 6.
sub step5 {
    my ( $workflow, $flags ) = @_;

    my $deskew =
"ocrd-cis-ocropy-deskew -I OCR-D-DENOISE -O OCR-D-DESKEW-PAGE -P level-of-operation page\n";

    return step6( $workflow . $deskew, $flags );
}

# Step 6: Dewarping (Page Level).
# The input image has to be binarized for the module to work, and a GPU is
# required. No processor is emitted here; workflow and flags pass through
# unchanged to step 7.
sub step6 {
    my ( $workflow, $flags ) = @_;
    return step7( $workflow, $flags );
}

# Step 7: Region segmentation.
#
# Notes from the workflow documentation:
# - ocrd-tesserocr-segment-region only yields bounding boxes instead of
#   polygon coordinates, so it is post-processed with ocrd-segment-repair
#   (plausibilize) to avoid large overlaps.
# - ocrd-sbb-textline-detector and ocrd-cis-ocropy-segment segment the page
#   AND the text lines in one step; with those (and only those) processors no
#   separate line segmentation is needed.
#
# Continues with step 8 once per processor and records in the flag
# "lineSegDone" (1 or 0) whether the chosen processor already produced lines.
sub step7 {
    my ( $workflow, $flags ) = @_;

    my @segmenters = (
"ocrd-tesserocr-segment-region -I OCR-D-DEWARP-PAGE -O OCR-D-SEG-REG-TOBEREPAIRED && ocrd-segment-repair -I OCR-D-SEG-REG-TOBEREPAIRED -O OCR-D-SEG-REG -P plausibilize true",
"ocrd-sbb-textline-detector -I OCR-D-DEWARP-PAGE -O OCR-D-SEG-LINE -P model /path/to/model",
"ocrd-cis-ocropy-segment -I OCR-D-DEWARP-PAGE -O OCR-D-SEG-LINE -P level-of-operation page",
"ocrd-anybaseocr-block-segmentation -I OCR-D-DEWARP-PAGE -O OCR-D-SEG-REG -P block_segmentation_model /path/to/mrcnn -P block_segmentation_weights /path/to/model/block_segmentation_weights.h5",
        "ocrd-pc-segmentation -I OCR-D-DEWARP-PAGE -O OCR-D-SEG-REG"
    );

    for my $segmenter (@segmenters) {
        my $lineSegDone = 0;
        if ( $segmenter =~
            /^(ocrd-sbb-textline-detector|ocrd-cis-ocropy-segment)/ )
        {
            $lineSegDone = 1;
        }

        my $updatedFlags = setFlags( $flags, "lineSegDone", $lineSegDone );
        step8( $workflow . "$segmenter\n", $updatedFlags );
    }
}

# Step 8: Binarization (Region Level).
# If the image was already binarized twice on page level and is not large,
# this step can probably be skipped. This script skips it: workflow and
# flags pass through unchanged to step 9.
sub step8 {
    my ( $workflow, $flags ) = @_;
    return step9( $workflow, $flags );
}

# Step 9: Clipping (Region Level).
#
# Clipping has to run BEFORE region-level deskewing: per the OCR-D workflow
# guide, clipping cannot be used after a processor that already added a
# derived image (AlternativeImage) on the same hierarchy level. The original
# order (deskew, then clip) violated that rule.
#
# The file group names used here are only placeholders; printStep renumbers
# them sequentially before the workflow is printed.
#
# Arguments: $w (workflow text so far), $flags (serialized flag string).
sub step9 {
    my ( $w, $flags ) = @_;

    step10(
        $w
          . "ocrd-cis-ocropy-clip -I OCR-D-BIN-REG -O OCR-D-CLIP-REG -P level-of-operation region\n",
        $flags
    );
}

# Step 10: Deskewing (Region Level).
#
# Runs on the clipped regions produced by step 9 and continues with step 11.
#
# Arguments: $w (workflow text so far), $flags (serialized flag string).
sub step10 {
    my ( $w, $flags ) = @_;

    step11(
        $w
          . "ocrd-cis-ocropy-deskew -I OCR-D-CLIP-REG -O OCR-D-DESKEW-REG -P level-of-operation region\n",
        $flags
    );
}

# Step 11: Line segmentation.
#
# Notes from the workflow documentation:
# - ocrd-tesserocr-segment-line only yields bounding boxes instead of polygon
#   coordinates and should be post-processed as described in Step 12; after
#   ocrd-cis-ocropy-segment one can go on directly with Step 13.
# - If region segmentation (Step 7) already produced text lines, no separate
#   line segmentation is needed.
#
# When the flags do not contain "lineSegDone=0", this step is skipped and the
# workflow continues with step 13. Otherwise step 12 is entered once per line
# segmenter, with the flag "mustResegmentLine" set to 1 for
# ocrd-tesserocr-segment-line and 0 otherwise.
sub step11 {
    my ( $workflow, $flags ) = @_;

    # Lines already exist (or the flag is absent): nothing to segment.
    return step13( $workflow, $flags )
      unless $flags =~ /\blineSegDone=0\b/;

    my @lineSegmenters = (
"ocrd-cis-ocropy-segment -I OCR-D-CLIP-REG -O OCR-D-SEG-LINE -P level-of-operation region",
        "ocrd-tesserocr-segment-line -I OCR-D-CLIP-REG -O OCR-D-SEG-LINE"
    );

    for my $lineSegmenter (@lineSegmenters) {
        my $mustResegment = 0;
        $mustResegment = 1
          if $lineSegmenter =~ /^ocrd-tesserocr-segment-line/;

        my $updatedFlags =
          setFlags( $flags, "mustResegmentLine", $mustResegment );
        step12( $workflow . "$lineSegmenter\n", $updatedFlags );
    }
}

# Step 12: post-processing of the line segmentation.
#
# Only applies when the flags contain "mustResegmentLine=1"; in that case
# step 13 is entered once per post-processing alternative (line-level
# clipping or resegmentation). Otherwise workflow and flags pass through
# unchanged to step 13.
sub step12 {
    my ( $workflow, $flags ) = @_;

    # No post-processing requested: go straight on.
    return step13( $workflow, $flags )
      unless $flags =~ /\bmustResegmentLine=1\b/;

    my @postProcessors = (
"ocrd-cis-ocropy-clip -I OCR-D-SEG-LINE -O OCR-D-CLIP-LINE -P level-of-operation line",
        "ocrd-cis-ocropy-resegment -I OCR-D-SEG-LINE -O OCR-D-RESEG"
    );

    for my $postProcessor (@postProcessors) {
        my $extended = $workflow . "$postProcessor\n";
        step13( $extended, $flags );
    }
}

# Step 13: line-level dewarping.
# Appends the single ocrd-cis-ocropy-dewarp command and continues with
# step 14.
sub step13 {
    my ( $workflow, $flags ) = @_;

    my $dewarp =
      "ocrd-cis-ocropy-dewarp -I OCR-D-CLIP-LINE -O OCR-D-DEWARP-LINE\n";

    return step14( $workflow . $dewarp, $flags );
}

# Step 14: text recognition.
# Final step: for each recognition processor the completed workflow is
# handed to printStep, which prints it together with the flags.
sub step14 {
    my ( $workflow, $flags ) = @_;

    my @recognizers = (
'TESSDATA_PREFIX="/test/data/tesseractmodels/" ocrd-tesserocr-recognize -I OCR-D-DEWARP-LINE -O OCR-D-zzz -P model Fraktur',
"ocrd-calamari-recognize -I OCR-D-DEWARP-LINE -O OCR-D-zzz -P checkpoint /path/to/models/\*.ckpt.json"
    );

    for my $recognizer (@recognizers) {
        my $complete = $workflow . "$recognizer\n";
        printStep( $complete, $flags );
    }
}

# Print one complete workflow variant followed by its flags.
#
# Arguments:
#   $w     - the workflow: one processor call per line, using placeholder
#            file group names (OCR-D-<UPPERCASE>) plus the special lowercase
#            placeholders OCR-D-aaa (first input) and OCR-D-zzz (last output)
#   $flags - serialized flag string, printed as a trailing comment
#
# Because optional steps may be skipped, the placeholder names of
# neighbouring steps do not necessarily match. All uppercase placeholders
# are therefore replaced by consecutive numbers, so that each step reads the
# file group the previous step wrote: -O 001, -I 001, -O 002, -I 002, ...
sub printStep {
    my ( $w, $flags ) = @_;

    # The first "-I OCR-D-aaa" is lowercase and is NOT matched below, so the
    # first match is an -O. The counter must start at 2 so that this -O gets
    # 001 and the following -I gets 001 as well (int(2/2) == int(3/2) == 1).
    # Starting at 3 paired every -I with its own -O instead of with the
    # preceding step's -O.
    my $n = 2;
    $w =~ s!(-[IO] OCR-D-)([A-Z0-9-]+)\b!sprintf("%s%03d", $1, int($n++/2))!ge;

    # Resolve the two special placeholders for the workflow's first input
    # and last output.
    $w =~ s!OCR-D-aaa!OCR-D-IMG!g;
    $w =~ s!OCR-D-zzz!OCR-D-OCR!g;

    print "$w # $flags\n---\n";
}

# Entry point: start the step chain with an empty workflow and no flags.
step0( q{}, q{} );
bertsky commented 3 years ago

Looks good, except for steps 9 and 10, which should be exchanged (clipping before deskewing).

jbarth-ubhd commented 3 years ago

Sorry, I'm not sure if I already mentioned this: Steps 9 & 10 are (still) listed at https://ocr-d.de/en/workflows in the order "Deskewing, Clipping".

kba commented 3 years ago

Looks good, except for steps 9 and 10, which should be exchanged (clipping before deskewing).

https://github.com/OCR-D/ocrd-website/pull/178

bertsky commented 1 year ago

@jbarth-ubhd can we close this?

jbarth-ubhd commented 1 year ago

quite old issue