I use DocTR to scan a PDF document for OCR. It's a large multi-page document with lots of unusual characters. DocTR does a great job detecting and converting it to text (it works shockingly well, actually). The only problem we've found is as follows:
Some sub-headings in our document are structured like: "This is the sub-title : --------" After the : the rest of the sentence is filled with ----- (This is to make sure you can't doctor the document after the fact.) We've tested about 100 documents that have this and DocTR structurally messes up all the sub-headings. It does detect and convert to text well but for some reason it puts the sub-headings at the end of text. The geometry coordinates are strange.
A good example is the sub-heading on line 9 of the text. It says: "De comparante verklaarde: --------".
I've attached an example PDF document so you guys can try it yourself.
Splitsingsakte.pdf
If you have any questions, feel free to reach out to me at Maxvankekeren-it@outlook.com. I have many more testing documents if need be.
Code snippet to reproduce the bug
#install DocTR
!pip install python-doctr
#import dependencies
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
#setup OCR model
model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
# get document from PDF
doc = DocumentFile.from_pdf("test.pdf")
result = model(doc)
result.render()
It works great, but again, sub-headings ending with ": -----" get strange placement.
Error traceback
DocTR gives no error. It thinks it did a good job and 99% it did. The only issue is that the sub-headings mentioned earlier are in the complete wrong location.
Environment
Copyright (C) 2021-2022, Mindee.
This program is licensed under the Apache License 2.0.
def run_and_read_all(run_lambda, command):
    """Runs `command` using `run_lambda`; returns its entire stdout if rc is 0.

    Args:
        run_lambda: callable taking a command string and returning a
            3-tuple ``(return_code, stdout, stderr)`` (see ``run``)
        command: shell command string to execute

    Returns:
        The command's stdout, or ``None`` on a non-zero return code.
    """
    # run_lambda returns a 3-tuple; the original `rc, out, = ...` two-name
    # unpacking would raise ValueError on every call.
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    return out
def run_and_parse_first_match(run_lambda, command, regex):
    """Runs `command` using `run_lambda`; returns the first regex group match.

    Args:
        run_lambda: callable taking a command string and returning a
            3-tuple ``(return_code, stdout, stderr)`` (see ``run``)
        command: shell command string to execute
        regex: pattern with one capture group, searched against stdout

    Returns:
        Group 1 of the first match, or ``None`` when the command fails or
        the pattern does not match.
    """
    # run_lambda returns a 3-tuple; the original `rc, out, = ...` two-name
    # unpacking would raise ValueError on every call.
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    match = re.search(regex, out)
    if match is None:
        return None
    return match.group(1)
def get_cudnn_version(run_lambda):
    """This will return a list of libcudnn.so; it's hard to tell which one is being used.

    Args:
        run_lambda: callable returning (return-code, stdout, stderr)

    Returns:
        A single resolved library path, a multi-line "Probably one of the
        following" listing, or ``None`` when nothing is found.
    """
    if get_platform() == "win32":
        # Raw string: in a plain literal '\b' is a backspace character,
        # which would corrupt the %CUDA_PATH%\bin path.
        cudnn_cmd = r'where /R "%CUDA_PATH%\bin" cudnn*.dll'
    elif get_platform() == "darwin":
        # CUDA libraries and drivers can be found in /usr/local/cuda/. See
        # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
        # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
        # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
        cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
    else:
        cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
    rc, out, _ = run_lambda(cudnn_cmd)
    # find will return 1 if there are permission errors or if not found
    if len(out) == 0 or (rc != 1 and rc != 0):
        lib = os.environ.get("CUDNN_LIBRARY")
        if lib is not None and os.path.isfile(lib):
            return os.path.realpath(lib)
        return None
    files = set()
    for fn in out.split("\n"):
        fn = os.path.realpath(fn)  # eliminate symbolic links
        if os.path.isfile(fn):
            files.add(fn)
    if not files:
        return None
    # Alphabetize the result because the order is non-deterministic otherwise
    files = list(sorted(files))
    if len(files) == 1:
        return files[0]
    result = "\n".join(files)
    return "Probably one of the following:\n{}".format(result)
def get_nvidia_smi():
    # Note: nvidia-smi is currently available only on Windows and Linux
    # NOTE(review): this function's body was lost in the capture; restored as
    # the bare command name — confirm against the upstream script.
    return "nvidia-smi"


def get_os(run_lambda):
    """Returns a human-readable description of the host operating system.

    NOTE(review): the ``def get_os(run_lambda):`` header and its first
    statement were fused into the previous function by the capture (the
    header is visible elsewhere in the flattened copy); restored here.
    """
    platform = get_platform()
    if platform == "win32" or platform == "cygwin":
        return get_windows_version(run_lambda)
    if platform == "darwin":
        version = get_mac_version(run_lambda)
        if version is None:
            return None
        return "Mac OSX {}".format(version)
    if platform == "linux":
        # Ubuntu/Debian based
        desc = get_lsb_version(run_lambda)
        if desc is not None:
            return desc
        # Try reading /etc/*-release
        desc = check_release_file(run_lambda)
        if desc is not None:
            return desc
        return platform
    # Unknown platform
    return platform
# Template for the final report produced by pretty_str(); each placeholder is
# filled from the corresponding SystemEnv namedtuple field.
env_info_fmt = """
DocTR version: {doctr_version}
TensorFlow version: {tf_version}
PyTorch version: {torch_version} (torchvision {torchvision_version})
OpenCV version: {cv2_version}
OS: {os}
Python version: {python_version}
Is CUDA available (TensorFlow): {is_cuda_available_tf}
Is CUDA available (PyTorch): {is_cuda_available_torch}
CUDA runtime version: {cuda_runtime_version}
GPU models and configuration: {nvidia_gpu_models}
Nvidia driver version: {nvidia_driver_version}
cuDNN version: {cudnn_version}
""".strip()
def pretty_str(envinfo):
    """Render a SystemEnv namedtuple as the human-readable report string."""

    def _fill_missing(info, placeholder="Could not collect"):
        # Swap every None value for the placeholder text.
        for key, value in info.items():
            if value is None:
                info[key] = placeholder
        return info

    def _humanize_bools(info, yes="Yes", no="No"):
        # True/False read better as Yes/No in the report.
        for key, value in info.items():
            if value is True:
                info[key] = yes
            elif value is False:
                info[key] = no
        return info

    def _break_multiline(text):
        # A multi-line value starts on its own line so the report stays aligned.
        if text is not None and len(text.split("\n")) > 1:
            return "\n{}\n".format(text)
        return text

    report = envinfo._asdict()
    report["nvidia_gpu_models"] = _break_multiline(envinfo.nvidia_gpu_models)

    dynamic_cuda_fields = [
        "cuda_runtime_version",
        "nvidia_gpu_models",
        "nvidia_driver_version",
    ]
    all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
    no_dynamic_cuda = all(report[field] is None for field in dynamic_cuda_fields)
    # Without any CUDA evidence, label the CUDA fields 'No CUDA' rather than
    # the generic 'Could not collect'.
    if TF_AVAILABLE and not any(tf.config.list_physical_devices("GPU")) and no_dynamic_cuda:
        for field in all_cuda_fields:
            report[field] = "No CUDA"

    report = _humanize_bools(report)
    report = _fill_missing(report)
    return env_info_fmt.format(**report)
def get_pretty_env_info():
    """Collects environment information for debugging purposes.

    Returns:
        str: environment information
    """
    env = get_env_info()
    return pretty_str(env)
Bug description
I use DocTR to scan a PDF document for OCR. It's a large multi-page document with lots of unusual characters. DocTR does a great job detecting and converting it to text (it works shockingly well, actually). The only problem we've found is as follows:
Some sub-headings in our document are structured like: "This is the sub-title : --------" After the : the rest of the sentence is filled with ----- (This is to make sure you can't doctor the document after the fact.) We've tested about 100 documents that have this and DocTR structurally messes up all the sub-headings. It does detect and convert to text well but for some reason it puts the sub-headings at the end of text. The geometry coordinates are strange.
A good example is the sub-heading on line 9 of the text. It says: "De comparante verklaarde: --------".
I've attached an example PDF document so you guys can try it yourself. Splitsingsakte.pdf
If you have any questions feel free to reach out to me Maxvankekeren-it@outlook.com. I have many more testing documents if needs be.
Code snippet to reproduce the bug
It works great, but again, sub-headings ending with ": -----" get strange placement.
Error traceback
DocTR gives no error. It thinks it did a good job and 99% it did. The only issue is that the sub-headings mentioned earlier are in the complete wrong location.
Environment
Copyright (C) 2021-2022, Mindee.
This program is licensed under the Apache License 2.0.
See LICENSE or go to https://opensource.org/licenses/Apache-2.0 for full license details.
"""
Based on https://github.com/pytorch/pytorch/blob/master/torch/utils/collect_env.py
This script outputs relevant system environment info.
Run it with `python collect_env.py`.
"""
# The markdown rendering stripped the dunders from `__future__`; as captured,
# `from future import ...` raises ImportError on any machine without the
# third-party `future` package (and would not enable the future features).
from __future__ import absolute_import, division, print_function, unicode_literals

import locale
import os
import re
import subprocess
import sys
from collections import namedtuple
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
try: import doctr
except (ImportError, NameError, AttributeError, OSError): DOCTR_AVAILABLE = False
try: import tensorflow as tf
except (ImportError, NameError, AttributeError, OSError): TF_AVAILABLE = False
try: import torch
except (ImportError, NameError, AttributeError, OSError): TORCH_AVAILABLE = False
try: import torchvision
except (ImportError, NameError, AttributeError, OSError): TV_AVAILABLE = False
try: import cv2
except (ImportError, NameError, AttributeError, OSError): CV2_AVAILABLE = False
PY3 = sys.version_info >= (3, 0)
# System Environment Information
# (the capture dropped the leading '#', turning this header into a syntax error)
SystemEnv = namedtuple(
    "SystemEnv",
    [
        "doctr_version",
        "tf_version",
        "torch_version",
        "torchvision_version",
        "cv2_version",
        "os",
        "python_version",
        "is_cuda_available_tf",
        "is_cuda_available_torch",
        "cuda_runtime_version",
        "nvidia_driver_version",
        "nvidia_gpu_models",
        "cudnn_version",
    ],
)
def run(command):
    """Returns (return-code, stdout, stderr)"""
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    raw_out, raw_err = proc.communicate()
    if PY3:
        # Decode with the platform's preferred encoding on Python 3.
        encoding = locale.getpreferredencoding()
        raw_out = raw_out.decode(encoding)
        raw_err = raw_err.decode(encoding)
    return proc.returncode, raw_out.strip(), raw_err.strip()
def run_and_read_all(run_lambda, command):
    """Runs `command` using `run_lambda`; returns its entire stdout if rc is 0.

    Args:
        run_lambda: callable returning (return-code, stdout, stderr)
        command: shell command string to execute

    Returns:
        The command's stdout, or ``None`` on a non-zero return code.
    """
    # run_lambda returns a 3-tuple; the captured `rc, out, = ...` two-name
    # unpacking would raise ValueError on every call.
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    return out
def run_and_parse_first_match(run_lambda, command, regex):
    """Runs `command` using `run_lambda`; returns the first regex group match.

    Args:
        run_lambda: callable returning (return-code, stdout, stderr)
        command: shell command string to execute
        regex: pattern with one capture group, searched against stdout

    Returns:
        Group 1 of the first match, or ``None`` on failure / no match.
    """
    # run_lambda returns a 3-tuple; the captured `rc, out, = ...` two-name
    # unpacking would raise ValueError on every call.
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    match = re.search(regex, out)
    if match is None:
        return None
    return match.group(1)
def get_nvidia_driver_version(run_lambda):
    """Returns the installed NVIDIA driver version string, or None.

    On macOS it is parsed from kextstat; elsewhere from nvidia-smi output.
    """
    if get_platform() == "darwin":
        cmd = "kextstat | grep -i cuda"
        # NOTE(review): the captured pattern `r"com[.]nvidia[.]CUDA ([)]"` was
        # garbled (unmatched group); restored from the upstream pytorch
        # collect_env script — confirm against nvidia kext output.
        return run_and_parse_first_match(run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]")
    smi = get_nvidia_smi()
    return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ")
def get_gpu_info(run_lambda):
    """Returns a description of the visible GPU(s), or None.

    On macOS the TensorFlow device name is used; elsewhere `nvidia-smi -L`
    output is returned with GPU UUIDs stripped.
    """
    if get_platform() == "darwin":
        if TF_AVAILABLE and any(tf.config.list_physical_devices("GPU")):
            return tf.config.list_physical_devices("GPU")[0].name
        return None
    smi = get_nvidia_smi()
    # NOTE(review): parentheses must be escaped to match the literal
    # " (UUID: ...)" suffix; the captured pattern had them unescaped.
    uuid_regex = re.compile(r" \(UUID: .+?\)")
    # run_lambda returns a 3-tuple; the captured two-name unpacking would
    # raise ValueError.
    rc, out, _ = run_lambda(smi + " -L")
    if rc != 0:
        return None
    # Anonymize GPUs by removing their UUID
    # (the final return statement was lost in the capture; restored from the
    # upstream script)
    return re.sub(uuid_regex, "", out)
def get_running_cuda_version(run_lambda):
    """Returns the CUDA toolkit release reported by `nvcc --version`, or None."""
    nvcc_cmd = "nvcc --version"
    release_pattern = r"release .+ V(.*)"
    return run_and_parse_first_match(run_lambda, nvcc_cmd, release_pattern)
def get_cudnn_version(run_lambda):
    """This will return a list of libcudnn.so; it's hard to tell which one is being used.

    NOTE(review): this duplicate was truncated mid-function in the capture;
    the body is restored from the intact copy earlier in the file.
    """
    if get_platform() == "win32":
        # Raw string: in a plain literal '\b' is a backspace character.
        cudnn_cmd = r'where /R "%CUDA_PATH%\bin" cudnn*.dll'
    elif get_platform() == "darwin":
        # CUDA libraries and drivers can be found in /usr/local/cuda/. See
        # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
        # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
        # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
        cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
    else:
        cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
    rc, out, _ = run_lambda(cudnn_cmd)
    # find will return 1 if there are permission errors or if not found
    if len(out) == 0 or (rc != 1 and rc != 0):
        lib = os.environ.get("CUDNN_LIBRARY")
        if lib is not None and os.path.isfile(lib):
            return os.path.realpath(lib)
        return None
    files = set()
    for fn in out.split("\n"):
        fn = os.path.realpath(fn)  # eliminate symbolic links
        if os.path.isfile(fn):
            files.add(fn)
    if not files:
        return None
    # Alphabetize the result because the order is non-deterministic otherwise
    files = list(sorted(files))
    if len(files) == 1:
        return files[0]
    return "Probably one of the following:\n{}".format("\n".join(files))
def get_nvidia_smi():
    # Note: nvidia-smi is currently available only on Windows and Linux
    # NOTE(review): the body was lost in the capture; restored as the bare
    # command name — confirm against the upstream script.
    return "nvidia-smi"
def get_platform():
    """Map sys.platform onto the canonical name used by this script.

    Returns one of "linux", "win32", "cygwin", "darwin", or the raw
    sys.platform value for anything else.
    """
    for canonical in ("linux", "win32", "cygwin", "darwin"):
        if sys.platform.startswith(canonical):
            return canonical
    return sys.platform
def get_mac_version(run_lambda):
    """Returns the macOS product version string, or None."""
    version_cmd = "sw_vers -productVersion"
    return run_and_parse_first_match(run_lambda, version_cmd, r"(.*)")
def get_windows_version(run_lambda):
    """Returns the Windows edition caption reported by wmic, or None."""
    caption_cmd = "wmic os get Caption | findstr /v Caption"
    return run_and_read_all(run_lambda, caption_cmd)
def get_lsb_version(run_lambda):
    """Returns the distro description from `lsb_release -a`, or None."""
    lsb_cmd = "lsb_release -a"
    return run_and_parse_first_match(run_lambda, lsb_cmd, r"Description:\t(.*)")
def check_release_file(run_lambda):
    """Reads the distro's PRETTY_NAME from /etc/*-release files, or None.

    NOTE(review): markdown rendering ate both asterisks in the capture —
    the shell glob (`/etc/-release`) and the regex quantifier (`(.)`);
    restored from the upstream pytorch collect_env script.
    """
    return run_and_parse_first_match(run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"')
def get_os(run_lambda):
    """Returns a human-readable description of the host operating system.

    NOTE(review): only the header and first statement survived the capture;
    the body is restored from the intact copy earlier in the file.
    """
    platform = get_platform()
    if platform == "win32" or platform == "cygwin":
        return get_windows_version(run_lambda)
    if platform == "darwin":
        version = get_mac_version(run_lambda)
        if version is None:
            return None
        return "Mac OSX {}".format(version)
    if platform == "linux":
        # Ubuntu/Debian based
        desc = get_lsb_version(run_lambda)
        if desc is not None:
            return desc
        # Try reading /etc/*-release
        desc = check_release_file(run_lambda)
        if desc is not None:
            return desc
        return platform
    # Unknown platform
    return platform
# NOTE(review): this definition is truncated in this capture — upstream it
# gathers all the get_* helper results into a SystemEnv namedtuple; only the
# first statement survived extraction. Left byte-identical pending the full
# source.
def get_env_info(): run_lambda = run
# Template for the report emitted by pretty_str(). The capture collapsed the
# newlines to spaces, which would print the whole report on one line; the
# layout is restored from the intact copy earlier in the file.
env_info_fmt = """
DocTR version: {doctr_version}
TensorFlow version: {tf_version}
PyTorch version: {torch_version} (torchvision {torchvision_version})
OpenCV version: {cv2_version}
OS: {os}
Python version: {python_version}
Is CUDA available (TensorFlow): {is_cuda_available_tf}
Is CUDA available (PyTorch): {is_cuda_available_torch}
CUDA runtime version: {cuda_runtime_version}
GPU models and configuration: {nvidia_gpu_models}
Nvidia driver version: {nvidia_driver_version}
cuDNN version: {cudnn_version}
""".strip()
def pretty_str(envinfo):
    """Render a SystemEnv namedtuple as a human-readable report string.

    NOTE(review): this duplicate was truncated after its first inner helper
    in the capture; the body is restored from the intact copy earlier in the
    file.
    """

    def replace_nones(dct, replacement="Could not collect"):
        # Swap every None value for the placeholder text.
        for key in dct.keys():
            if dct[key] is not None:
                continue
            dct[key] = replacement
        return dct

    def replace_bools(dct, true="Yes", false="No"):
        # True/False read better as Yes/No in the report.
        for key in dct.keys():
            if dct[key] is True:
                dct[key] = true
            elif dct[key] is False:
                dct[key] = false
        return dct

    def maybe_start_on_next_line(string):
        # If `string` is multiline, prepend a \n to it.
        if string is not None and len(string.split("\n")) > 1:
            return "\n{}\n".format(string)
        return string

    mutable_dict = envinfo._asdict()
    # If nvidia_gpu_models is multiline, start on the next line
    mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(envinfo.nvidia_gpu_models)
    # If the machine doesn't have CUDA, report some fields as 'No CUDA'
    dynamic_cuda_fields = [
        "cuda_runtime_version",
        "nvidia_gpu_models",
        "nvidia_driver_version",
    ]
    all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
    all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None for field in dynamic_cuda_fields)
    if TF_AVAILABLE and not any(tf.config.list_physical_devices("GPU")) and all_dynamic_cuda_fields_missing:
        for field in all_cuda_fields:
            mutable_dict[field] = "No CUDA"
    # Replace True with Yes, False with No
    mutable_dict = replace_bools(mutable_dict)
    # Replace all None objects with 'Could not collect'
    mutable_dict = replace_nones(mutable_dict)
    return env_info_fmt.format(**mutable_dict)
def get_pretty_env_info():
    """Collects environment information for debugging purposes

    Returns:
        str: environment information
    """
    # The capture collapsed this onto one line, which fuses the docstring and
    # the return statement into invalid syntax; restored.
    return pretty_str(get_env_info())
def main():
    """Entry point: print the collected environment report to stdout."""
    # The capture collapsed three statements onto the def line (invalid
    # syntax); restored.
    print("Collecting environment information...\n")
    output = get_pretty_env_info()
    print(output)
# Markdown rendering stripped the dunders; as captured this compares the
# undefined name `name` and raises NameError at import time.
if __name__ == "__main__":
    main()
Deep Learning backend
is_tf_available: True is_torch_available: True