Fewer lstmf files being generated as compared to the ground truth

AyushP123 commented 5 years ago

Tesseract Version: 4.1.0

I am trying to fine tune tesseract on custom dataset with the following Makefile:

export

SHELL := /bin/bash
HOME := $(PWD)
TESSDATA = $(HOME)/tessdata
LANGDATA = $(HOME)/langdata

# Train directory
# TRAIN := $(HOME)/train_data
TRAIN := /media/vimaan/Data/OCR/tesseract_train

# Name of the model to be built
MODEL_NAME = eng
LANG_CODE = eng

# Name of the model to continue from
CONTINUE_FROM = eng

TESSDATA_REPO = _best

# Normalization Mode - see src/training/language_specific.sh for details 
NORM_MODE = 1

# BEGIN-EVAL makefile-parser --make-help Makefile

help:
    @echo ""
    @echo "  Targets"
    @echo ""
    @echo "    unicharset       Create unicharset"
    @echo "    lists            Create lists of lstmf filenames for training and eval"
    @echo "    training         Start training"
    @echo "    proto-model      Build the proto model"
    @echo "    leptonica        Build leptonica"
    @echo "    tesseract        Build tesseract"
    @echo "    tesseract-langs  Download tesseract-langs"
    @echo "    langdata         Download langdata"
    @echo "    clean            Clean all generated files"
    @echo ""
    @echo "  Variables"
    @echo ""
    @echo "    MODEL_NAME         Name of the model to be built"
    @echo "    CORES              No of cores to use for compiling leptonica/tesseract"
    @echo "    LEPTONICA_VERSION  Leptonica version. Default: $(LEPTONICA_VERSION)"
    @echo "    TESSERACT_VERSION  Tesseract commit. Default: $(TESSERACT_VERSION)"
    @echo "    LANGDATA_VERSION   Tesseract langdata version. Default: $(LANGDATA_VERSION)"
    @echo "    TESSDATA_REPO      Tesseract model repo to use. Default: $(TESSDATA_REPO)"
    @echo "    TRAIN              Train directory"
    @echo "    RATIO_TRAIN        Ratio of train / eval training data"

# END-EVAL

# Ratio of train / eval training data
RATIO_TRAIN := 0.90

ALL_BOXES = data/all-boxes
ALL_LSTMF = data/all-lstmf

# Create unicharset
unicharset: data/unicharset

# Create lists of lstmf filenames for training and eval
#lists: $(ALL_LSTMF) data/list.train data/list.eval
lists: $(ALL_LSTMF)

train-lists: data/list.train data/list.eval

data/list.train: $(ALL_LSTMF)
    total=`cat $(ALL_LSTMF) | wc -l` \
       no=`echo "$$total * $(RATIO_TRAIN) / 1" | bc`; \
       head -n "$$no" $(ALL_LSTMF) > "$@"

data/list.eval: $(ALL_LSTMF)
    total=`cat $(ALL_LSTMF) | wc -l` \
       no=`echo "($$total - $$total * $(RATIO_TRAIN)) / 1" | bc`; \
       tail -n "$$no" $(ALL_LSTMF) > "$@"

# Start training
training: data/$(MODEL_NAME).traineddata

data/unicharset: $(ALL_BOXES)
    mkdir -p data/$(START_MODEL)
    combine_tessdata -u $(TESSDATA)/$(CONTINUE_FROM).traineddata  $(TESSDATA)/$(CONTINUE_FROM).
    unicharset_extractor --output_unicharset "$(TRAIN)/my.unicharset" --norm_mode $(NORM_MODE) "$(ALL_BOXES)"
    #merge_unicharsets data/$(START_MODEL)/$(START_MODEL).lstm-unicharset $(GROUND_TRUTH_DIR)/my.unicharset  "$@"
    merge_unicharsets $(TESSDATA)/$(CONTINUE_FROM).lstm-unicharset $(TRAIN)/my.unicharset  "$@"

$(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(TRAIN)/*.tif)))
    find $(TRAIN) -name '*.box' -exec cat {} \; > "$@"

$(TRAIN)/%.box: $(TRAIN)/%.tif $(TRAIN)/%.gt.txt
    python generate_line_box.py -i "$(TRAIN)/$*.tif" -t "$(TRAIN)/$*.gt.txt" > "$@"

$(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(TRAIN)/*.tif)))
    find $(TRAIN) -name '*.lstmf' -exec echo {} \; | sort -R -o "$@"

$(TRAIN)/%.lstmf: $(TRAIN)/%.box
    tesseract $(TRAIN)/$*.tif $(TRAIN)/$* --dpi 300 --psm 7 lstm.train

# Build the proto model
proto-model: data/$(MODEL_NAME)/$(MODEL_NAME).traineddata

data/$(MODEL_NAME)/$(MODEL_NAME).traineddata: $(LANGDATA) data/unicharset
    combine_lang_model \
      --input_unicharset data/unicharset \
      --script_dir $(LANGDATA) \
      --words $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).wordlist \
      --numbers $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).numbers \
      --puncs $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).punc \
      --output_dir data/ \
      --lang $(MODEL_NAME)

data/checkpoints/$(MODEL_NAME)_checkpoint: unicharset proto-model
    mkdir -p data/checkpoints
    lstmtraining \
      --continue_from   $(TESSDATA)/$(CONTINUE_FROM).lstm \
      --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
      --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
      --model_output data/checkpoints/$(MODEL_NAME) \
      --debug_interval -1 \
      --train_listfile data/list.train \
      --eval_listfile data/list.eval \
      --sequential_training \
      --max_iterations 170000

data/$(MODEL_NAME).traineddata: data/checkpoints/$(MODEL_NAME)_checkpoint
    lstmtraining \
    --stop_training \
    --continue_from $^ \
    --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
    --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
    --model_output $@

# Clean all generated files
clean:
    find data/train -name '*.box' -delete
    find data/train -name '*.lstmf' -delete
    rm -rf data/all-*
    rm -rf data/list.*
    rm -rf data/$(MODEL_NAME)
    rm -rf data/unicharset
    rm -rf data/checkpoints

The number of .lstmf files being generated is significantly lower than .box files being generated. For eg: Number of .tif files: 10k Number of .gt.txt files: 10k Number of .box files: 10k Number of .lstmf files: 8k.

Could anyone point me out to the possible reasons for this issue.

stweil commented 5 years ago

See https://github.com/tesseract-ocr/tesstrain/wiki/GT4HistOCR#tesseract-fails-to-create-lstm-files for a possible reason and solution. That article also lists other potential problems.

AyushP123 commented 5 years ago

Thanks for your response @stweil. It really helped. Using psm 7 was the issue, with psm 13 all empty string outputs got eliminated. I think that's a bug with Tesseract.

stweil commented 5 years ago

We still need a test how using psm 13 instead of 7 changes the training results. If there are no negative effects, the default psm should be changed to 13.

tesseract-ocr / tesstrain

Fewer lstmf files being generated as compared to the ground truth #90