chroming / pdfdir

PDF导航(大纲/目录)添加工具
GNU General Public License v3.0
508 stars 54 forks source link

这个有点overkill了,我简化成一个脚本 #36

Open vincentaxhe opened 1 month ago

vincentaxhe commented 1 month ago

我用的是之前的版本,当尝试建立下级目录时,需要想出精巧的正则表达式,这就有点 overkill 了,我想要用 tab 来指示层级。不知道现在的版本是不是还会忽略行首的 tab。

#!/bin/python
import os
import re
import sys
from collections import defaultdict
from pypdf import PdfWriter, PdfReader

class Pdf(object):
    """Load a PDF into a writer so outline items can be added and saved.

    The result is written next to the source file, with ``_new`` inserted
    before the extension; the source file itself is not written to.
    """

    def __init__(self, path):
        """Read the PDF at *path* into a new ``PdfWriter``.

        Args:
            path: filesystem path of the source PDF.
        """
        self.path = path
        # Hand the path (not an open file object) to PdfReader so that no
        # file handle opened here is left unclosed.
        reader = PdfReader(path, strict=False)
        self.writer = PdfWriter()
        self.writer.append(reader)
        # Drop the "/Outlines" entry from the writer's root object, if any,
        # so the items added later are not mixed with a pre-existing outline.
        self.writer._root_object.pop("/Outlines", None)

    @property
    def _new_path(self):
        """Return the output path: ``<name>_new<ext>`` derived from ``self.path``."""
        name, ext = os.path.splitext(self.path)
        return name + '_new' + ext

    def add_bookmark(self, title, pagenum, parent=None):
        """Add one outline item and return what the writer returns for it.

        Args:
            title: text of the outline item.
            pagenum: page number forwarded unchanged to
                ``PdfWriter.add_outline_item``.
            parent: value returned by an earlier call, to nest this item
                under it; ``None`` for a top-level item.
        """
        return self.writer.add_outline_item(title, pagenum, parent=parent)

    def save_pdf(self):
        """Write the document to ``_new_path`` and return that path.

        An existing file at the output path is removed first.
        """
        target = self._new_path
        if os.path.exists(target):
            os.remove(target)
        with open(target, 'wb') as out:
            self.writer.write(out)
        return target

def _add_bookmark(pdf, index_dict):
    """Add every entry of *index_dict* to *pdf* as an outline item.

    Entries are processed in ascending key order, so the keys do not have
    to be contiguous. A parent must have a smaller key than its children;
    an entry whose ``'parent'`` is missing or unknown is added at top level.

    Args:
        pdf: object exposing ``add_bookmark(title, pagenum, parent)``; the
            value it returns is passed back as ``parent`` for child entries.
        index_dict: mapping ``index -> {'title': ..., 'pagenum': ...,
            'parent': ...}`` where ``'parent'`` is optional and holds the
            index of the parent entry.

    Returns:
        None.
    """
    if not index_dict:
        return None
    created = {}  # {entry index: object returned by pdf.add_bookmark}
    for index in sorted(index_dict):
        entry = index_dict[index]
        created[index] = pdf.add_bookmark(
            entry['title'],
            # One is subtracted before handing the page number to the PDF
            # object (presumably 1-based page number -> 0-based page index).
            entry['pagenum'] - 1,
            created.get(entry.get('parent')),
        )
    return None

def add_bookmark(path, index_dict):
    """Build the outline described by *index_dict* for the PDF at *path*.

    Args:
        path: path of the source PDF, passed to ``Pdf``.
        index_dict: outline entries, passed to ``_add_bookmark``.

    Returns:
        Whatever ``Pdf.save_pdf()`` returns for the saved document.
    """
    document = Pdf(path)
    _add_bookmark(document, index_dict)
    saved_to = document.save_pdf()
    return saved_to

class _TocFormatError(ValueError, AssertionError):
    """Raised by ``toc_reader`` for a malformed table-of-contents file.

    It derives from ``AssertionError`` as well as ``ValueError`` so callers
    that caught the assertion failures raised by earlier versions keep
    working.
    """


def toc_reader(path, gap):
    """Parse a tab-indented table-of-contents file into an index dict.

    Each non-blank line must look like ``<tabs><title><TAB><page>``: the
    number of leading tabs is the nesting depth, the title contains no tab,
    and the page is a run of digits. Blank lines are skipped. The file is
    opened in text mode with the platform's default encoding.

    Args:
        path: path of the table-of-contents text file.
        gap: offset added to every page number; anything ``int()`` accepts.

    Returns:
        dict mapping a 0-based entry index (in file order) to
        ``{'title': str, 'pagenum': int}``, plus ``'parent': <entry index>``
        for entries nested under another entry.

    Raises:
        ValueError: (also an ``AssertionError``) when a line does not match
            the expected format, when page numbers decrease, or when a line
            is indented more than one level deeper than the entry above it.
            Line numbers in the message are 1-based.
    """
    pattern = re.compile(r'^(\t*)([^\t]+)\t(\d+)$')
    offset = int(gap)  # converted once instead of on every line
    tocdict = {}
    # open_entries[d] is the index of the most recent entry at depth d whose
    # subtree is still open; deeper levels are discarded whenever a shallower
    # entry appears, so a child can never attach to a stale parent.
    open_entries = []
    lastpagenum = 0
    with open(path, 'r') as toc:
        for lineno, item in enumerate(toc, start=1):
            if not item.strip():
                continue  # tolerate blank lines, e.g. a trailing empty line
            text = item.rstrip('\n')
            content = pattern.search(item)
            if content is None:
                raise _TocFormatError(
                    f"line {lineno}: {text!r} is ill-formatted")
            indent, title, pagenum = content.group(1, 2, 3)
            depth = len(indent)
            pagenum = int(pagenum) + offset
            if pagenum < lastpagenum:
                raise _TocFormatError(
                    f"line {lineno}: {text!r} page number goes backwards")
            if depth > len(open_entries):
                raise _TocFormatError(
                    f"line {lineno}: {text!r} is indented deeper than "
                    f"one level below the previous entry")
            index = len(tocdict)  # contiguous even when blank lines are skipped
            entry = {'title': title, 'pagenum': pagenum}
            if depth > 0:
                entry['parent'] = open_entries[depth - 1]
            tocdict[index] = entry
            del open_entries[depth:]
            open_entries.append(index)
            lastpagenum = pagenum
    return tocdict
if __name__ == '__main__':
    # Command line: <script> <pdf> <toc> [gap]
    #   pdf - PDF file the outline is added to
    #   toc - table-of-contents text file read by toc_reader
    #   gap - offset added to every page number (optional, defaults to 0)
    args = sys.argv[1:]
    if len(args) not in (2, 3):
        # Exit with a usage message instead of an unpacking traceback.
        sys.exit(f"usage: {sys.argv[0]} <pdf> <toc> [gap]")
    file, toc = args[:2]
    gap = args[2] if len(args) == 3 else 0
    index_dict = toc_reader(toc, gap)
    add_bookmark(file, index_dict)

使用 `pdfbookmark.py xxx.pdf toc 10` 来运行它:toc 是目录文本文件,每行格式为"标题<Tab>页码",用行首的 tab 缩进来分级;10 是页码偏移量,会加到每一行的页码上。

chroming commented 1 month ago

新版本已经支持用空格分层了,不过脚本写得挺好的