Implement OCR for images

bandrel commented 1 year ago

Check out this project. It should be modular enough to just plug in: https://github.com/bandrel/OCyara

RoseSecurity commented 1 year ago

@bandrel Would you be interested in working on this issue? I can knock it out but haven't taken a look into this yet!

bandrel commented 1 year ago

I don't really have the time to work on it at the moment, but this should help.

    def _pdf_extract(self, pdffile: str) -> None:
        """
        Extract JPG images from a PDF file and save them to the temp directory.

        pdf_extract is used by the run() method and should not be called directly
        in most circumstances.

        The raw PDF bytes are scanned for ``stream`` keywords. A stream is treated
        as a JPG when the JPG start marker (FF D8) is found within the 20 bytes
        that begin at the keyword. The image ends at the first JPG end marker
        (FF D9) found when searching from 20 bytes before the following
        ``endstream`` keyword. Each image is written to
        ``<self.tempdir.name>/jpgN.jpg``, with N counting up from 0.

        Arguments:
            pdffile -- A string file path pointing to a PDF

        Raises:
            Exception -- if a JPG stream has no ``endstream`` keyword after it, or
                         no JPG end marker can be found for it
        """
        self.logger.info('Opening %s and extracting JPG images', pdffile)
        with open(pdffile, "rb") as file:
            pdf = file.read()

        startmark = b"\xff\xd8"
        endmark = b"\xff\xd9"
        i = 0
        njpg = 0
        while True:
            istream = pdf.find(b"stream", i)
            if istream < 0:
                break
            # Only streams whose payload starts with the JPG marker are extracted.
            istart = pdf.find(startmark, istream, istream + 20)
            if istart < 0:
                i = istream + 20
                continue
            iend = pdf.find(b"endstream", istart)
            if iend < 0:
                raise Exception("Didn't find end of stream!")
            # Clamp the search start to the image start: for short streams
            # ``iend - 20`` can fall before ``istart`` or even go negative, and a
            # negative start makes bytes.find() count from the end of the buffer.
            iend = pdf.find(endmark, max(istart, iend - 20))
            if iend < 0:
                raise Exception("Didn't find end of JPG!")

            # Include the end marker itself in the extracted image.
            iend += len(endmark)
            jpgpath = self.tempdir.name + "/jpg%d.jpg" % njpg
            self.logger.debug('Creating temporary file ' + jpgpath)
            with open(jpgpath, "wb") as jpgfile:
                jpgfile.write(pdf[istart:iend])
            njpg += 1
            # Resume scanning right after the image just written.
            i = iend

That will get you the extracted JPEG, which is the first step. This second one should get you the text extracted from the JPEG.

    def _process_image(self, yara_rule: str, save_context: bool) -> None:
        """
        Worker loop that pulls queued images and runs OCR on them.

        process_image() is launched by run() as one of several worker processes
        running in parallel; it is normally not called directly.

        Arguments:
            yara_rule -- File path pointing to a Yara rule file
            save_context -- not read by the portion of the method shown here
        """
        context = None
        pid = os.getpid()

        # Each worker logs through its own logger whose name carries the PID, so
        # every message header identifies the process that emitted it.
        log_format = '%(log_color)s%(levelname)s:%(name)s:%(message)s'
        stream_handler = colorlog.StreamHandler()
        stream_handler.setFormatter(colorlog.ColoredFormatter(log_format))
        worker_logger = colorlog.getLogger('worker_' + str(pid))
        worker_logger.addHandler(stream_handler)
        worker_logger.setLevel(self.logger.level)
        worker_logger.info('PID {0} created to process queue'.format(str(pid)))

        while True:
            try:
                image, filepath = self.q.get(timeout=.25)
            except Empty:
                # An empty queue only means "finished" once everything that was
                # meant to be queued has been queued; otherwise keep polling.
                still_loading = (
                    self.total_added_to_queue[0] != self.total_items_to_queue[0]
                )
                if still_loading:
                    worker_logger.debug('Queue still loading')
                    continue
                worker_logger.debug('Queue Empty PID %d exiting' % pid)
                return
            ocrtext = tesserocr.image_to_text(image)

RoseSecurity / ScrapPY

Implement OCR for images #2