jwilk-archive / ocrodjvu

OCR for DjVu
GNU General Public License v2.0
45 stars 19 forks source link

parallel mode for djvu2hocr #25

Open jwilk opened 6 years ago

jwilk commented 6 years ago

I had this patch to add -j/--jobs to djvu2hocr lying around:

diff --git a/lib/cli/djvu2hocr.py b/lib/cli/djvu2hocr.py
--- a/lib/cli/djvu2hocr.py
+++ b/lib/cli/djvu2hocr.py
@@ -12,6 +12,8 @@
 # General Public License for more details.

 import argparse
+import functools
+import itertools
 import locale
 import os
 import re
@@ -44,6 +46,7 @@
         argparse.ArgumentParser.__init__(self, usage=usage)
         self.add_argument('-v', '--version', action='version', version=version, help='show version information and exit')
         self.add_argument('-p', '--pages', dest='pages', action='store', default=None, help='pages to convert')
+        self.add_argument('-j', '--jobs', dest='n_jobs', metavar='N', nargs='?', type=int, default=1, help='number of jobs to run simultaneously')
         self.add_argument('path', metavar='FILE', help='DjVu file to process')
         group = self.add_argument_group(title='word segmentation options')
         group.add_argument('--word-segmentation', dest='word_segmentation', choices=('simple', 'uax29'), default='space', help='word segmentation algorithm')
@@ -62,6 +65,8 @@
         else:
             options.icu = None
             options.locale = None
+        if options.n_jobs is None:
+            options.n_jobs = utils.get_cpu_count()
         return options

 class CharacterLevelDetails(Exception):
@@ -280,6 +285,21 @@
 </html>
 '''

+def extract_page_zones(djvused, page_iterator, options):
+    for n in page_iterator:
+        try:
+            page_size = [
+                int(str(sexpr.Expression.from_stream(djvused.stdout).value).split('=')[1])
+                for i in xrange(2)
+            ]
+            options.page_bbox = text_zones.BBox(0, 0, page_size[0], page_size[1])
+            page_text = sexpr.Expression.from_stream(djvused.stdout)
+        except sexpr.ExpressionSyntaxError:
+            break
+        logger.info('- Page #%d', n)
+        page_zone = Zone(page_text, page_size[1])
+        yield page_zone
+
 def main(argv=sys.argv):
     options = ArgumentParser().parse_args(argv[1:])
     logger.info('Converting %s:' % utils.smart_repr(options.path, system_encoding))
@@ -307,19 +327,24 @@
             ocr_system='djvu2hocr %s' % __version__,
             ocr_capabilities=' '.join(hocr.djvu2hocr_capabilities)
     ))
-    for n in page_iterator:
+    page_zones = extract_page_zones(djvused, page_iterator, options)
+    if options.n_jobs <= 1:
+        do_process_page = functools.partial(process_page, options=options)
+        process = itertools.imap(do_process_page, page_zones)
+        pool = None
+    else:
         try:
-            page_size = [
-                int(str(sexpr.Expression.from_stream(djvused.stdout).value).split('=')[1])
-                for i in xrange(2)
-            ]
-            options.page_bbox = text_zones.BBox(0, 0, page_size[0], page_size[1])
-            page_text = sexpr.Expression.from_stream(djvused.stdout)
-        except sexpr.ExpressionSyntaxError:
-            break
-        logger.info('- Page #%d', n)
-        page_zone = Zone(page_text, page_size[1])
-        process_page(page_zone, options)
+            import multiprocessing
+        except ImportError:
+            # TODO
+            raise
+        do_process_page = functools.partial(process_page, options=options)
+        pool = multiprocessing.Pool(options.n_jobs)
+        process = pool.imap(do_process_page, page_zones, chunksize=8)
+    for none in process:
+        pass
+    if pool is not None:
+        pool.close()
     sys.stdout.write(hocr_footer)
     djvused.wait()

The patch no longer applies cleanly, and I don't even remember whether it ever worked correctly. :-/