Open jwilk opened 6 years ago
I had this patch to add `-j`/`--jobs` to djvu2hocr lying around:
diff --git a/lib/cli/djvu2hocr.py b/lib/cli/djvu2hocr.py --- a/lib/cli/djvu2hocr.py +++ b/lib/cli/djvu2hocr.py @@ -12,6 +12,8 @@ # General Public License for more details. import argparse +import functools +import itertools import locale import os import re @@ -44,6 +46,7 @@ argparse.ArgumentParser.__init__(self, usage=usage) self.add_argument('-v', '--version', action='version', version=version, help='show version information and exit') self.add_argument('-p', '--pages', dest='pages', action='store', default=None, help='pages to convert') + self.add_argument('-j', '--jobs', dest='n_jobs', metavar='N', nargs='?', type=int, default=1, help='number of jobs to run simultaneously') self.add_argument('path', metavar='FILE', help='DjVu file to process') group = self.add_argument_group(title='word segmentation options') group.add_argument('--word-segmentation', dest='word_segmentation', choices=('simple', 'uax29'), default='space', help='word segmentation algorithm') @@ -62,6 +65,8 @@ else: options.icu = None options.locale = None + if options.n_jobs is None: + options.n_jobs = utils.get_cpu_count() return options class CharacterLevelDetails(Exception): @@ -280,6 +285,21 @@ </html> ''' +def extract_page_zones(djvused, page_iterator, options): + for n in page_iterator: + try: + page_size = [ + int(str(sexpr.Expression.from_stream(djvused.stdout).value).split('=')[1]) + for i in xrange(2) + ] + options.page_bbox = text_zones.BBox(0, 0, page_size[0], page_size[1]) + page_text = sexpr.Expression.from_stream(djvused.stdout) + except sexpr.ExpressionSyntaxError: + break + logger.info('- Page #%d', n) + page_zone = Zone(page_text, page_size[1]) + yield page_zone + def main(argv=sys.argv): options = ArgumentParser().parse_args(argv[1:]) logger.info('Converting %s:' % utils.smart_repr(options.path, system_encoding)) @@ -307,19 +327,24 @@ ocr_system='djvu2hocr %s' % __version__, ocr_capabilities=' '.join(hocr.djvu2hocr_capabilities) )) - for n in page_iterator: + page_zones = 
extract_page_zones(djvused, page_iterator, options) + if options.n_jobs <= 1: + do_process_page = functools.partial(process_page, options=options) + process = itertools.imap(do_process_page, page_zones) + pool = None + else: try: - page_size = [ - int(str(sexpr.Expression.from_stream(djvused.stdout).value).split('=')[1]) - for i in xrange(2) - ] - options.page_bbox = text_zones.BBox(0, 0, page_size[0], page_size[1]) - page_text = sexpr.Expression.from_stream(djvused.stdout) - except sexpr.ExpressionSyntaxError: - break - logger.info('- Page #%d', n) - page_zone = Zone(page_text, page_size[1]) - process_page(page_zone, options) + import multiprocessing + except ImportError: + # TODO + raise + do_process_page = functools.partial(process_page, options=options) + pool = multiprocessing.Pool(options.n_jobs) + process = pool.imap(do_process_page, page_zones, chunksize=8) + for none in process: + pass + if pool is not None: + pool.close() sys.stdout.write(hocr_footer) djvused.wait()
It no longer applies cleanly, and I don't even remember if it worked correctly at all. :-/
I had this patch to add `-j`/`--jobs` to djvu2hocr lying around. It no longer applies cleanly, and I don't even remember if it worked correctly at all. :-/