Open vincentaxhe opened 5 months ago
我用的是之前的版本：要建立下级目录时，需要想出精巧的正则表达式，这就有点 overkill 了。我希望能用 tab 缩进来指示层级，不知道现在的版本是否仍然忽略行首的 tab。
#!/usr/bin/env python3
"""Add a hierarchical outline (bookmarks) to a PDF from a tab-indented TOC file.

Usage::

    pdfbookmark.py FILE.pdf TOC [GAP]

Every non-blank TOC line has the form ``<tabs><title><TAB><page>``. The number
of leading tabs is the nesting depth of the entry, and ``GAP`` is added to every
page number before it is used. The result is written next to the input file as
``<name>_new<ext>``.
"""
import argparse
import os
import re
import sys

# <leading tabs><title without tabs><TAB><page digits>
_TOC_LINE = re.compile(r'^(\t*)([^\t]+)\t(\d+)$')


class Pdf:
    """A PDF copied into a writer so that a fresh outline can be attached."""

    def __init__(self, path):
        """Load *path* into a new writer and drop its existing outline entry."""
        # Imported here rather than at module level so that the TOC parser
        # stays importable on machines without pypdf installed.
        from pypdf import PdfReader, PdfWriter

        self.path = path
        # Hand pypdf the path instead of an open file object: the original
        # ``open(path, "rb")`` handle was never closed.
        reader = PdfReader(path, strict=False)
        self.writer = PdfWriter()
        self.writer.append(reader)
        # Remove the "/Outlines" entry from the writer's root object so only
        # the bookmarks added through this class remain.
        # NOTE(review): relies on a private pypdf attribute.
        self.writer._root_object.pop("/Outlines", None)

    @property
    def _new_path(self):
        """Return the output path: the input path with ``_new`` before the extension."""
        name, ext = os.path.splitext(self.path)
        return name + '_new' + ext

    def add_bookmark(self, title, pagenum, parent=None):
        """Add one outline item and return the object pypdf hands back.

        *pagenum* is passed to pypdf unchanged; callers in this file pass a
        0-based page index. *parent* is the object returned for the parent
        item, or ``None`` for a top-level item.
        """
        return self.writer.add_outline_item(title, pagenum, parent=parent)

    def save_pdf(self):
        """Write the PDF to ``_new_path`` (replacing an existing file) and return that path."""
        target = self._new_path
        if os.path.exists(target):
            os.remove(target)
        with open(target, 'wb') as out:
            self.writer.write(out)
        return target


def _add_bookmark(pdf, index_dict):
    """Create every outline item described by *index_dict* on *pdf*.

    Keys are processed in ascending order. ``toc_reader`` always gives a
    parent a smaller key than its children, so each parent object already
    exists when a child needs it.
    """
    if not index_dict:
        return None
    created = {}  # {entry key: object returned by pdf.add_bookmark}
    for key in sorted(index_dict):
        entry = index_dict[key]
        parent = created.get(entry.get('parent'))
        # TOC page numbers are 1-based; the outline API takes a 0-based index.
        created[key] = pdf.add_bookmark(entry['title'], entry['pagenum'] - 1, parent)
    return None


def add_bookmark(path, index_dict):
    """Attach the outline in *index_dict* to the PDF at *path*; return the new file's path."""
    pdf = Pdf(path)
    _add_bookmark(pdf, index_dict)
    return pdf.save_pdf()


def toc_reader(path, gap, encoding="utf-8"):
    """Parse a tab-indented TOC file into the dict consumed by ``add_bookmark``.

    Args:
        path: TOC file; each non-blank line is ``<tabs><title><TAB><page>``.
        gap: Offset added to every page number (anything ``int()`` accepts).
        encoding: Text encoding of the TOC file.

    Returns:
        ``{index: {'title': str, 'pagenum': int, 'parent': index}}`` where
        *index* counts entries from 0 and ``'parent'`` is present only for
        indented entries.

    Raises:
        ValueError: A line is malformed, is indented more than one level
            deeper than the previous entry, or has a page number that is
            below 1 or lower than the previous entry's after the offset.
    """
    offset = int(gap)
    tocdict = {}
    # ancestors[d] is the key of the latest entry at depth d on the current
    # branch. A per-level "last seen" list would attach an entry to a stale
    # parent from an earlier branch whenever the indentation skips a level.
    ancestors = []
    lastpagenum = 1
    with open(path, 'r', encoding=encoding) as toc:
        for lineno, raw in enumerate(toc, start=1):
            text = raw.rstrip()
            if not text:
                # Blank lines (typically a trailing one) carry no entry.
                continue
            content = _TOC_LINE.match(text)
            if content is None:
                raise ValueError(
                    f"line {lineno}: {text!r} is ill-formatted, "
                    "expected '<tabs>title<TAB>page'"
                )
            indent, title, pagenum = content.group(1, 2, 3)
            depth = len(indent)
            if depth > len(ancestors):
                raise ValueError(
                    f"line {lineno}: {text!r} is indented {depth} level(s) "
                    f"but only {len(ancestors)} parent level(s) are open"
                )
            pagenum = int(pagenum) + offset
            if pagenum < 1:
                raise ValueError(
                    f"line {lineno}: {text!r} maps to page {pagenum}, "
                    "which is before the first page"
                )
            if pagenum < lastpagenum:
                raise ValueError(
                    f"line {lineno}: {text!r} maps to page {pagenum}, "
                    f"which is before the previous entry's page {lastpagenum}"
                )
            entry = {'title': title, 'pagenum': pagenum}
            if depth > 0:
                entry['parent'] = ancestors[depth - 1]
            key = len(tocdict)
            tocdict[key] = entry
            # Close deeper or equal levels, then open this entry's own level.
            del ancestors[depth:]
            ancestors.append(key)
            lastpagenum = pagenum
    return tocdict


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    parser = argparse.ArgumentParser(
        description="Add bookmarks to a PDF from a tab-indented TOC file."
    )
    parser.add_argument("file", help="input PDF; output goes to <name>_new<ext>")
    parser.add_argument("toc", help="TOC file, one '<tabs>title<TAB>page' entry per line")
    parser.add_argument(
        "gap", nargs="?", type=int, default=0,
        help="offset added to every TOC page number (default: 0)",
    )
    args = parser.parse_args(argv)
    try:
        index_dict = toc_reader(args.toc, args.gap)
    except (OSError, ValueError) as err:
        print(f"error: {err}", file=sys.stderr)
        return 1
    add_bookmark(args.file, index_dict)
    return 0


if __name__ == '__main__':
    sys.exit(main())
使用 `pdfbookmark.py xxx.pdf toc 10` 来运行它：toc 文件用 tab 缩进来分级，最后的 10 是加到每个页码上的偏移量。
新版本已经支持用空格分层了，不过这个脚本写得挺好的。
我用的是之前的版本：要建立下级目录时，需要想出精巧的正则表达式，这就有点 overkill 了。我希望能用 tab 缩进来指示层级，不知道现在的版本是否仍然忽略行首的 tab。
使用 `pdfbookmark.py xxx.pdf toc 10` 来运行它：toc 文件用 tab 缩进来分级，最后的 10 是加到每个页码上的偏移量。