Open bandrel opened 1 year ago
@bandrel Would you be interested in working on this issue? I can knock it out but haven't taken a look into this yet!
I don't really have the time to work on it at the moment, but this should help.
def _pdf_extract(self, pdffile: str) -> None:
    """
    Extract JPG images from a PDF file and save them to the temp directory.

    pdf_extract is used by the run() method and should not be called directly
    in most circumstances.

    The PDF is scanned as raw bytes: for every ``stream`` keyword that is
    followed within 20 bytes by the JPEG start marker (FF D8), the data up to
    the end marker (FF D9) located near the matching ``endstream`` keyword is
    written to ``<self.tempdir.name>/jpg<N>.jpg``, with N counting from 0.

    Arguments:
        pdffile -- A string file path pointing to a PDF

    Raises:
        ValueError -- if a JPEG stream is opened but no ``endstream`` keyword
                      or no JPEG end marker can be found after it
    """
    self.logger.info('Opening %s and extracting JPG images', pdffile)
    with open(pdffile, "rb") as file:
        pdf = file.read()

    startmark = b"\xff\xd8"  # JPEG start-of-image marker
    endmark = b"\xff\xd9"    # JPEG end-of-image marker
    i = 0
    njpg = 0
    while True:
        istream = pdf.find(b"stream", i)
        if istream < 0:
            break
        # Only treat the stream as a JPEG when the start marker sits right
        # after the "stream" keyword (within 20 bytes).
        istart = pdf.find(startmark, istream, istream + 20)
        if istart < 0:
            i = istream + 20
            continue
        iend = pdf.find(b"endstream", istart)
        if iend < 0:
            raise ValueError("Didn't find end of stream!")
        # Look for the end marker shortly before "endstream", but never
        # before the image start: for very short streams ``iend - 20`` can
        # fall before ``istart`` (or below zero), which would produce an
        # empty slice and move the scan position backwards.
        iend = pdf.find(endmark, max(istart, iend - 20))
        if iend < 0:
            raise ValueError("Didn't find end of JPG!")
        iend += len(endmark)  # include the end marker in the output

        jpg_path = "%s/jpg%d.jpg" % (self.tempdir.name, njpg)
        self.logger.debug('Creating temporary file %s', jpg_path)
        with open(jpg_path, "wb") as jpgfile:
            jpgfile.write(pdf[istart:iend])
        njpg += 1
        i = iend
That will get you the extracted JPEG, which is the first step. This second one should get you the text extracted from the JPEG.
def _process_image(self, yara_rule: str, save_context: bool) -> None:
    """
    Perform OCR and yara rule matching as a worker.

    process_image() is used by the run() method to create multiple worker
    processes for parallel execution. process_image normally will not be
    called directly.

    NOTE: this is the excerpt as posted; it ends right after the OCR call, so
    the yara matching step and any use of ``yara_rule``, ``save_context`` and
    ``context`` are not shown here.

    Arguments:
        yara_rule -- File path pointing to a Yara rule file
        save_context -- Not used in the visible part of this method;
                        presumably controls whether matched text is kept
                        (TODO confirm against the full implementation)
    """
    context = None  # not used in the visible excerpt
    pid = os.getpid()

    handler = colorlog.StreamHandler()
    handler.setFormatter(colorlog.ColoredFormatter(
        '%(log_color)s%(levelname)s:%(name)s:%(message)s'))
    # Creates a logger object for the individual workers that contains the
    # PID as part of the message header
    worker_logger = colorlog.getLogger('worker_%d' % pid)
    worker_logger.addHandler(handler)
    worker_logger.setLevel(self.logger.level)
    worker_logger.info('PID %d created to process queue', pid)

    while True:
        try:
            image, filepath = self.q.get(timeout=.25)
        except Empty:
            # The queue is empty: stop once everything that was going to be
            # queued has been queued, otherwise wait for more items.
            if self.total_added_to_queue[0] == self.total_items_to_queue[0]:
                worker_logger.debug('Queue Empty PID %d exiting', pid)
                return
            worker_logger.debug('Queue still loading')
            continue
        ocrtext = tesserocr.image_to_text(image)
Check out this project; it should be modular enough to just plug in: https://github.com/bandrel/OCyara