py-pdf / pypdf

A pure-python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files
https://pypdf.readthedocs.io/en/latest/
Other
8.15k stars 1.39k forks source link

Chinese PDF add watermark Chinese garbled (picture, text watermark will) #526

Closed ftconan closed 2 years ago

ftconan commented 4 years ago

I use reportlab to generate Chinese PDF, the font uses Microsoft Yahei, and PyPDF2 is used to add watermark to the original PDF garbled. How do I solve it?Thank you very much.(python3) @jerem @jasonbot @wolever @josephw @vfigueiro

"""
@author: magician
@file:   pdf_tools.py
@date:   2019/11/26
"""
import os

from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from reportlab.lib.units import cm

from pdf_file import pdf_dir

def _create_text_mark(content):
    """
    创建文字水印
    :param content: 文字
    :return:
    """
    # 默认大小为21cm*29.7cm
    mark_path = os.path.join(pdf_dir, 'mark.pdf')
    c = canvas.Canvas(mark_path, pagesize=(30 * cm, 30 * cm))
    # 移动坐标原点(坐标系左下为(0,0))
    c.translate(10 * cm, 5 * cm)

    # 设置字体
    # c.setFont("hei", 80)
    # 指定描边的颜色
    c.setStrokeColorRGB(0, 1, 0)
    # 指定填充颜色
    c.setFillColorRGB(0, 1, 0)
    # 画一个矩形
    c.rect(cm, cm, 7 * cm, 17 * cm, fill=1)

    # 旋转45度,坐标系被旋转
    c.rotate(45)
    # 指定填充颜色
    c.setFillColorRGB(0.6, 0, 0)
    # 设置透明度,1为不透明
    c.setFillAlpha(0.3)
    # 画几个文本,注意坐标系旋转的影响
    c.drawString(3 * cm, 0 * cm, content)
    c.setFillAlpha(0.6)
    c.drawString(6 * cm, 3 * cm, content)
    c.setFillAlpha(1)
    c.drawString(9 * cm, 6 * cm, content)

    c.save()

    return mark_path

def _create_pic_mark(pic_name):
    """
    创建图片水印
    :param  pic_name: 图片名称
    :return:
    """
    pic_path = os.path.join(pdf_dir, pic_name)
    mark_path = os.path.join(pdf_dir, 'mark.pdf')
    c = canvas.Canvas(mark_path, pagesize=(30 * cm, 30 * cm))
    # 设置透明度
    c.setFillAlpha(0.5)
    c.drawImage(pic_path, 7 * cm, 7 * cm, width=6 * cm, height=6 * cm)
    c.save()

    return mark_path

def add_watermark(mark_type, mark_data, input_pdf):
    """
    添加水印
    :param mark_type:   text: 文字水印  picture: 图片水印
    :param mark_data:   水印数据  text: 文本   picture: 图片名称
    :param input_pdf:   输入PDF
    :return:
    """
    # 写入PDF
    output_pdf = PdfFileWriter()

    # 1.生成水印
    if mark_type == 'text':
        mark_path = _create_text_mark(mark_data)
    elif mark_type == 'picture':
        mark_path = _create_pic_mark(mark_data)
    else:
        return {'message': 'PDF水印类型错误!'}, 400

    # 2.读入水印pdf文件
    watermark_pdf = PdfFileReader(open(mark_path, 'rb'))

    # 3.读取PDF
    pdf_reader = PdfFileReader(open(input_pdf, 'rb'))

    # PDF文件被加密了
    if pdf_reader.getIsEncrypted():
        try:
            # 尝试用空密码解密
            pdf_reader.decrypt('')
        except Exception as e:
            print(e)
            return {'message': 'PDF解密失败!'}, 400

    # 给每一页打水印
    page_num = pdf_reader.getNumPages()
    for i in range(page_num):
        page = pdf_reader.getPage(i).extractText()
        print(type(page))
        print(page.encode('utf-8').decode('utf-8'))
        page.mergePage(watermark_pdf.getPage(0))
        # 压缩内容
        # page.compressContentStreams()
        output_pdf.addPage(page)

    # 4.合成PDF
    with open(input_pdf, "wb") as file:
        output_pdf.write(file)

    return True
MartinThoma commented 2 years ago

Thank you for reporting an issue!

However, this script has at least two issues:

  1. Missing main: It's only function definitions, no function call
  2. At least one wrong usage of PyPDF2:
        page = pdf_reader.getPage(i).extractText()  # <---- now 'page' is a string
        print(type(page))
        print(page.encode('utf-8').decode('utf-8'))
        page.mergePage(watermark_pdf.getPage(0)) # <--- strings dont have a mergePage method

As it has been a long time, I doubt that you can still remember what was wrong. If you encounter another issue, please test it again with a recent PyPDF2 version and file a new issue.