nguyenq / tess4j

Java JNA wrapper for Tesseract OCR API
Apache License 2.0
1.6k stars 373 forks source link

Tesseract.doOCR() failing on Windows from cmd line for Java 17, works in eclipse #237

Closed ralphcook closed 2 years ago

ralphcook commented 2 years ago

I use Tess4j in a Java desktop app running on Windows 10. I provide the app for free for organizations working with poor people trying to get their driving licenses restored. It's been working fine with Java 8.

Now I'm trying to get it to work with more recent versions of Java, and it fails when making the doOCR call. I've written two classes that illustrate the problem:

        package sandbox.ocrtest;

        import java.io.File;

        public class TTest
        {
          public static void main(String [] arguments)
          {
            TTest tTest = new TTest();
            tTest.go(arguments);
          }

          public void go(String[] arguments)
          {
            File file = new File("c:\\Users\\Ralph\\files\\programming\\workspaces\\DrivingRecordTool\\DrivingRecordTool\\testFiles\\LegalAid\\pdf\\NCDOT_Record02-28-2022 - clientname.pdf");
            PDFTextReader reader = new PDFTextReader();
            String text = reader.getOCRText(file);
            System.out.println(text);
            System.out.println("==TTest done.==");
          }
        }

and:

        package sandbox.ocrtest;

        import java.awt.image.BufferedImage;
        import java.io.File;

        import org.apache.pdfbox.pdmodel.PDDocument;
        import org.apache.pdfbox.rendering.ImageType;
        import org.apache.pdfbox.rendering.PDFRenderer;

        import net.sourceforge.tess4j.Tesseract;

        /**
         * Extracts text from a PDF file by rendering each page to an image with
         * PDFBox and running Tesseract OCR (via Tess4J) on that image.
         */
        public class PDFTextReader
        {
          /** Resolution in DPI used to render pages; also passed to Tesseract as "user_defined_dpi". */
          private static final int RENDER_DPI = 300;

          /**
           * Returns the OCR text of the given PDF. Delegates to {@link #getOCRText(File)}.
           *
           * @param pdfFile the PDF file to read
           * @return the recognized text; see {@link #getOCRText(File)} for failure behavior
           */
          public String getText(File pdfFile)
          {
            return getOCRText(pdfFile);
          }

          /** Prints the 1-based number of the page about to be processed. */
          private void reportPage(int i) { System.out.println("Page " + i); }

          /** Prints a diagnostic message to standard output. */
          private static void say(String msg) { System.out.println(msg); }

          /**
           * Renders every page of the PDF at {@value #RENDER_DPI} DPI and concatenates
           * the OCR text of the pages in order.
           *
           * <p>This method is best-effort: it never throws. If loading, rendering or
           * OCR fails, the stack trace is printed and the text gathered so far
           * (possibly empty) is returned. If OCR yields {@code null} for a page,
           * processing stops at that page.
           *
           * @param pdfFile the PDF file to read
           * @return the text recognized before any failure; never {@code null}
           */
          public String getOCRText(File pdfFile)
          {
            // Local, single-threaded accumulator: StringBuilder avoids StringBuffer's locking.
            StringBuilder result = new StringBuilder();

            // try-with-resources guarantees the document is closed on every exit path.
            try (PDDocument document = PDDocument.load(pdfFile))
            {
              PDFRenderer pdfRenderer = new PDFRenderer(document);

              Tesseract tesseract = new Tesseract();
              tesseract.setDatapath("C:\\Program Files\\Tesseract-OCR\\tessdata\\");
              tesseract.setLanguage("eng");
              tesseract.setPageSegMode(6);
              // These settings do not change per page, so set them once before the loop.
              tesseract.setVariable("user_defined_dpi", Integer.toString(RENDER_DPI));
              tesseract.setVariable("tessedit_write_image", "true");

              int numberOfPages = document.getNumberOfPages();

              for (int pageIndex = 0; pageIndex < numberOfPages; pageIndex++)
              {
                int pageNumber = pageIndex + 1;
                reportPage(pageNumber);
                BufferedImage buffImage = pdfRenderer.renderImageWithDPI(pageIndex, RENDER_DPI, ImageType.RGB);
                String pageResult = tesseract.doOCR(buffImage);
                if (pageResult == null)
                {
                  // Treat a null result as fatal: retrying the same page would loop forever.
                  say("Null result from OCR, page = " + pageNumber);
                  break;
                }
                result.append(pageResult);
              }
            }
            // Deliberately broad: Errors raised while the OCR library initializes
            // (e.g. ExceptionInInitializerError) must not escape, so the caller
            // still receives whatever text was gathered.
            catch (Throwable t) { t.printStackTrace(); }

            return result.toString();
          }
        }

I use Maven to get version 5.4.0 of Tess4j; here is the pom.xml:

        <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
          <modelVersion>4.0.0</modelVersion>
          <groupId>org.rc</groupId>
          <artifactId>tess4jtest</artifactId>
          <version>0.1-SNAPSHOT</version>
          <packaging>jar</packaging> 
          <name>Tess4j test</name>
          <description>Test invocation of Tess4j</description>
          <properties>
            <startclass>sandbox.ocrtest.TTest</startclass>
          </properties>  

          <build>
              <plugins>
                  <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.0</version>
                    <!--
                    <configuration>
                      <source>1.8</source>
                      <target>1.8</target>
                    </configuration>
                    -->
                  </plugin>
                  <plugin>
                    <!-- build an executable jar -->
                    <artifactId>maven-assembly-plugin</artifactId>
                    <executions>
                        <execution>
                            <phase>package</phase>
                            <goals><goal>single</goal></goals>
                            <configuration>
                                <archive>
                                    <manifest>
                                        <mainClass>sandbox.ocrtest.TTest</mainClass>
                                    </manifest>
                                </archive>
                                <descriptorRefs>
                                    <descriptorRef>jar-with-dependencies</descriptorRef>
                                </descriptorRefs>
                            </configuration>
                        </execution>
                    </executions>
                  </plugin>
              </plugins>
          </build>

          <dependencies>
            <!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
            <dependency>
                <groupId>net.sourceforge.tess4j</groupId>
                <artifactId>tess4j</artifactId>
                <version>5.4.0</version>
            </dependency>
          </dependencies>

        </project>

I have Tesseract-OCR v5.2.0.20220712 installed on the machine where this test is run.

When I run the program in eclipse, configured to use the Java 17 runtime, I get the following output:

        SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
        SLF4J: Defaulting to no-operation (NOP) logger implementation
        SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
        Page 1
        Sep 27, 2022 12:07:45 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
        WARNING: Using fallback font 'CourierNewPS-BoldMT' for 'CourierNewBold'
        Page 2
        Page 3
        Page 4
        Page 5
        NORTH CAROLINA DIVISION OF MOTOR VEHICLES
        RDLSI/DRIVING RECORD CHECK
        ... (rest of the text from the PDF OCR operation) ...
        ==TTest done.==

When I run the program from a Windows CMD line on the same machine, from a Runnable Jar exported from the same eclipse project, I get the following:

        C:\Users\Ralph\files\programming\workspaces\tess4j\tess4j\target>java -jar TTest.jar
        SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
        SLF4J: Defaulting to no-operation (NOP) logger implementation
        SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
        Page 1
        Sep 27, 2022 12:05:27 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
        WARNING: Using fallback font 'CourierNewPS-BoldMT' for 'CourierNewBold'
        java.lang.ExceptionInInitializerError
                at net.sourceforge.tess4j.Tesseract.init(Tesseract.java:442)
                at net.sourceforge.tess4j.Tesseract.doOCR(Tesseract.java:326)
                at net.sourceforge.tess4j.Tesseract.doOCR(Tesseract.java:309)
                at net.sourceforge.tess4j.Tesseract.doOCR(Tesseract.java:290)
                at net.sourceforge.tess4j.Tesseract.doOCR(Tesseract.java:274)
                at sandbox.ocrtest.PDFTextReader.getOCRText(PDFTextReader.java:51)
                at sandbox.ocrtest.TTest.go(TTest.java:17)
                at sandbox.ocrtest.TTest.main(TTest.java:10)
                at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
                at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
                at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
                at java.base/java.lang.reflect.Method.invoke(Method.java:568)
                at org.eclipse.jdt.internal.jarinjarloader.JarRsrcLoader.main(JarRsrcLoader.java:61)
        Caused by: java.lang.IllegalStateException: zip file closed
                at java.base/java.util.zip.ZipFile.ensureOpen(ZipFile.java:831)
                at java.base/java.util.zip.ZipFile.getEntry(ZipFile.java:330)
                at java.base/java.util.jar.JarFile.getEntry(JarFile.java:518)
                at java.base/sun.net.www.protocol.jar.URLJarFile.getEntry(URLJarFile.java:131)
                at java.base/java.util.jar.JarFile.getJarEntry(JarFile.java:473)
                at java.base/jdk.internal.loader.URLClassPath$JarLoader.getResource(URLClassPath.java:954)
                at java.base/jdk.internal.loader.URLClassPath.getResource(URLClassPath.java:319)
                at java.base/java.net.URLClassLoader$1.run(URLClassLoader.java:424)
                at java.base/java.net.URLClassLoader$1.run(URLClassLoader.java:421)
                at java.base/java.security.AccessController.doPrivileged(AccessController.java:712)
                at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:420)
                at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
                at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
                at net.sourceforge.tess4j.util.LoadLibs.getTessAPIInstance(LoadLibs.java:83)
                at net.sourceforge.tess4j.TessAPI.<clinit>(TessAPI.java:42)
                ... 13 more

        ==TTest done.==

        C:\Users\Ralph\files\programming\workspaces\tess4j\tess4j\target>

If I uninstall Java 17 and install Java 8, things work as expected.

Am I doing something wrong? Is Tess4j not supported for Java versions past 8, or for Windows 10?

PDFTextReader.java.txt TTest.java.txt

ralphcook commented 2 years ago

I have chased the problem down further: Java 17 does not include the com.sun.jna package in the Java runtime, though Java 8 does. Tess4j therefore fails because it needs com.sun.jna.Native and com.sun.jna.Platform. I tried adding a dependency on net.java.dev.jna, which is the Maven group listed for this package. This did put jna-5.12.1.jar in the Maven dependencies and in the application jar, but the command-line invocation of that jar still fails the same way.

ralphcook commented 2 years ago

It turns out I had an improper build of my jar file; once I fixed that, compiling my application for Java 8 enabled it to run on a later version. User error.