Extremely inefficient FPDF._out implementation

YouJiacheng commented 3 years ago

In fpdf (not fpdf2), self.buffer is a str, not a bytearray:

def _out(self, s):
    # Add a line to the document.
    #
    # `s` is first normalised to the interpreter's native string type, then
    # appended (followed by a newline) to the current page when
    # self.state == 2, or to self.buffer otherwise.
    if PY3K and isinstance(s, bytes):
        # manage binary data as latin1 until PEP461-like function is implemented
        s = s.decode("latin1")
    # `not PY3K` is tested first, so the name `unicode` is never evaluated
    # when PY3K is true.
    elif not PY3K and isinstance(s, unicode):
        s = s.encode("latin1")    # default encoding (font name and similar)
    # NOTE(review): `basestring` is presumably supplied by a compatibility
    # shim when PY3K is true -- its definition is not visible in this excerpt.
    elif not isinstance(s, basestring):
        s = str(s)
    if(self.state==2):
        self.pages[self.page]+=s+"\n"
    else:
        # Per the report, self.buffer is a str here, so every += may copy the
        # whole buffer built so far; the reporter identifies this line as the
        # cause of O(N^2) total time when assembling the document.
        self.buffer+=s+"\n"

YouJiacheng commented 3 years ago

I fixed it by subclassing FPDF; others who encounter the same problem can use the following code:

from fpdf import FPDF

# Variant 1: the document buffer is a bytearray, but str concatenation is still used within each page
class FPDF_fixed1(FPDF):
    """FPDF subclass whose document buffer is a ``bytearray``.

    Appending to a ``bytearray`` is done in place, so writing to the document
    buffer no longer rebuilds it on every ``_out`` call. Page contents are
    still accumulated as ``str`` in ``self.pages``.
    """

    def __init__(self, orientation='P', unit='mm', format='A4'):
        super().__init__(orientation=orientation, unit=unit, format=format)
        # Swap in a mutable byte buffer after the base constructor has run.
        self.buffer = bytearray()

    def _out(self, s):
        """Append ``s`` and a newline to the current page or to the buffer.

        While ``self.state == 2`` the data goes to the current page as
        ``str``; otherwise it goes to ``self.buffer`` as bytes. Text and
        binary data are converted between the two using latin1; any other
        object is converted with ``str()`` first.
        """
        if self.state == 2:
            # Pages are still str: decode binary input, stringify the rest.
            # bytearray is checked explicitly -- it is not a bytes subclass,
            # so it would otherwise be written as its repr.
            if isinstance(s, (bytes, bytearray)):
                s = s.decode('latin1')
            elif not isinstance(s, str):
                s = str(s)
            self.pages[self.page] += s + '\n'
        else:
            if not isinstance(s, (bytes, bytearray)):
                if not isinstance(s, str):
                    s = str(s)
                s = s.encode('latin1')
            # Two in-place appends avoid building a temporary copy of a
            # potentially large chunk just to add the newline.
            self.buffer += s
            self.buffer += b'\n'

    def output(self, name='', dest=''):
        """Finish the document if necessary and deliver it.

        ``dest`` selects the destination (case-insensitive), following the
        parameter documented for PyFPDF's ``output``:

        * ``'F'`` -- write to the file ``name`` (default when ``name`` is given)
        * ``'I'`` / ``'D'`` -- write to standard output (default when ``name``
          is empty)
        * ``'S'`` -- return the document as ``bytes``

        NOTE(review): depending on the PyFPDF version the base class may
        return ``str`` for ``'S'``; here the buffer is binary, so ``bytes``
        is returned.

        Raises ``ValueError`` for any other ``dest``.
        """
        import sys

        if self.state < 3:
            self.close()
        dest = dest.upper()
        if not dest:
            dest = 'F' if name else 'I'
        if dest == 'F':
            with open(name, 'wb') as f:
                f.write(self.buffer)
            return None
        if dest == 'S':
            return bytes(self.buffer)
        if dest in ('I', 'D'):
            # Use the binary layer of stdout when there is one.
            stdout = getattr(sys.stdout, 'buffer', sys.stdout)
            stdout.write(self.buffer)
            return None
        raise ValueError('Incorrect output destination: ' + dest)

# Variant 2: fully bytearray-based (pages and document buffer), but it does not support compression or the page-number alias.
# You can override _putpages to restore support for compression and page numbers.
class FPDF_fixed2(FPDF):
    """FPDF subclass that keeps pages and the document buffer as ``bytearray``.

    All output is appended in place. Compression is forced off and
    ``alias_nb_pages`` is a no-op in this variant.
    """

    def __init__(self, orientation='P', unit='mm', format='A4'):
        super().__init__(orientation=orientation, unit=unit, format=format)
        # Swap in a mutable byte buffer after the base constructor has run.
        self.buffer = bytearray()

    def _out(self, s):
        """Append ``s`` and a newline to the current page or to the buffer.

        ``str`` input is encoded as latin1; ``bytes``/``bytearray`` input is
        used as is; any other object is converted with ``str()`` first. The
        data goes to the current page while ``self.state == 2`` and to
        ``self.buffer`` otherwise.
        """
        # bytearray is checked explicitly: pages are bytearrays in this class
        # and bytearray is not a bytes subclass, so a page passed back in
        # would otherwise be written as its repr.
        if not isinstance(s, (bytes, bytearray)):
            if not isinstance(s, str):
                s = str(s)
            s = s.encode('latin1')
        # Two in-place appends avoid building a temporary copy of a
        # potentially large chunk just to add the newline.
        if self.state == 2:
            self.pages[self.page] += s
            self.pages[self.page] += b'\n'
        else:
            self.buffer += s
            self.buffer += b'\n'

    def output(self, name='', dest=''):
        """Finish the document if necessary and deliver it.

        ``dest`` selects the destination (case-insensitive), following the
        parameter documented for PyFPDF's ``output``:

        * ``'F'`` -- write to the file ``name`` (default when ``name`` is given)
        * ``'I'`` / ``'D'`` -- write to standard output (default when ``name``
          is empty)
        * ``'S'`` -- return the document as ``bytes``

        NOTE(review): depending on the PyFPDF version the base class may
        return ``str`` for ``'S'``; here the buffer is binary, so ``bytes``
        is returned.

        Raises ``ValueError`` for any other ``dest``.
        """
        import sys

        if self.state < 3:
            self.close()
        dest = dest.upper()
        if not dest:
            dest = 'F' if name else 'I'
        if dest == 'F':
            with open(name, 'wb') as f:
                f.write(self.buffer)
            return None
        if dest == 'S':
            return bytes(self.buffer)
        if dest in ('I', 'D'):
            # Use the binary layer of stdout when there is one.
            stdout = getattr(sys.stdout, 'buffer', sys.stdout)
            stdout.write(self.buffer)
            return None
        raise ValueError('Incorrect output destination: ' + dest)

    def _beginpage(self, orientation):
        """Start a page, then replace its content holder with a bytearray."""
        super()._beginpage(orientation)
        self.pages[self.page] = bytearray()

    def set_compression(self, compress):
        """Disabled: ``compress`` is ignored and compression is turned off."""
        return super().set_compression(False)

    def alias_nb_pages(self, alias='{nb}'):
        """Disabled: accepted for interface compatibility, does nothing.

        The default mirrors the one documented for PyFPDF's
        ``alias_nb_pages`` so that calling it without arguments still works.
        """
        pass

YouJiacheng commented 3 years ago

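After fixing that problem, the time to convert ~500 JPGs (~1 MB each) to PDF dropped from 1200 s (*) to 3 s.

(*): I only measured the time for converting 50 JPGs of ~1 MB each, which was 12 s; the 1200 s figure is extrapolated from that measurement, assuming O(N^2) scaling.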

Lucas-C commented 3 years ago

PyFPDF is not maintained anymore; you may want to check out PyFPDF/fpdf2.

Its FPDF._out method is a lot faster thanks to its use of a bytearray buffer.

reingart / pyfpdf

Extremely inefficient FPDF._out implementation #185