phpdave11 / gofpdi

Go Free PDF Document Importer
MIT License
118 stars 59 forks source link

A weird problem: default pdf text language changed after importing pdf page #43

Open kaiceo opened 3 years ago

kaiceo commented 3 years ago
import (
          "github.com/phpdave11/gofpdf"
          "github.com/phpdave11/gofpdi"
      )

// FontType is the file name of the font that NewPdf loads from static/font/;
// NewPdf also passes it as the font family name.
var FontType = "NotoSansSC-Regular.ttf"

// NewPdf creates a document with orientation "P", unit "pt" and size "A4",
// registers the font file named by FontType as a UTF-8 font, and returns the
// document.
func NewPdf() *gofpdf.Fpdf {
	doc := gofpdf.New("P", "pt", "A4", "")

	// Register the font used for Chinese text. The family name is the font
	// file name itself.
	fontFile := "static/font/" + FontType
	doc.AddUTF8Font(FontType, "", fontFile)

	return doc
}

// imp is the package-level importer used by every ImportPdfPages call.
var imp = gofpdi.NewImporter()

// ImportPdfPages appends every page of pdffile to pdf as an imported template
// and returns pdf.
//
// Pages whose rotation is an odd multiple of 90 degrees are added in landscape
// orientation with the template's width and height swapped. All other pages
// are added with AddPage and drawn at their /MediaBox width and height.
//
// If the rotation of a page cannot be read, the error is recorded on pdf with
// SetError and the pages imported so far are kept; callers can inspect it
// through pdf's usual error accessors.
func ImportPdfPages(pdf *gofpdf.Fpdf, pdffile string) *gofpdf.Fpdf {
	imp.SetSourceFile(pdffile)
	pageSizes := imp.GetPageSizes()
	pdfReader := imp.GetReader()

	// Page numbers are 1-based.
	for i := 1; i <= len(pageSizes); i++ {
		rotation, err := pdfReader.GetPageRotation(i)
		if err != nil {
			// Do not read rotation after a failed lookup; surface the failure
			// through the document instead of discarding it.
			pdf.SetError(err)
			return pdf
		}

		mediaBox := pageSizes[i]["/MediaBox"]
		curWidth, curHeight := mediaBox["w"], mediaBox["h"]

		// Normalize angle into (-360, 360).
		angle := rotation.Int % 360

		tpl := getTemplateID(imp, pdf, i, "/MediaBox")

		// An odd number of quarter turns means the page is displayed sideways.
		// A zero angle yields 0 here, so no separate zero check is needed.
		if (angle/90)%2 != 0 {
			pdf.AddPageFormat(gofpdf.OrientationLandscape, gofpdf.SizeType{Wd: curWidth, Ht: curHeight})
			pdf.UseImportedTemplate(imp.UseTemplate(tpl, 0, 0, curHeight, curWidth))
			continue
		}

		// NOTE(review): AddPage does not receive the source page size here;
		// confirm this is intended for sources that are not the same size as pdf.
		pdf.AddPage()
		pdf.UseImportedTemplate(imp.UseTemplate(tpl, 0, 0, curWidth, curHeight))
	}
	return pdf
}

// getTemplateID imports page pageno of i's current source using the given box
// and returns the template id produced by ImportPage. Before returning it hands
// i's form XObjects, imported objects and imported object hash positions to f,
// in that order.
//
// NOTE(review): the original comment only named github.com/phpdave11/gofpdi;
// presumably this helper was adapted from that project — confirm.
func getTemplateID(i *gofpdi.Importer, f *gofpdf.Fpdf, pageno int, box string) int {
	templateID := i.ImportPage(pageno, box)

	// Each importer result is fetched immediately before it is passed to f,
	// keeping the call order unchanged.
	f.ImportTemplates(i.PutFormXobjectsUnordered())
	f.ImportObjects(i.GetImportedObjectsUnordered())
	f.ImportObjPos(i.GetImportedObjHashPos())

	return templateID
}

// Tested on Windows 7 and CentOS 7.5 with Go 1.16. Before importing, the PDF text (sample PDF, page 2) is in Chinese: image

After importing, the text has changed to English:

QQ截图20210527100919

I don't know the reason.

The sample PDF is here ↓↓↓↓↓↓↓↓↓

BODYL-P1.pdf

kaiceo commented 3 years ago

It seems the reason is that CID fonts are not supported. Furthermore, I found that the DecodeParms entry in the PDF source does not seem to be supported in reader.go: <</DecodeParms<</Columns 5/Predictor 12>

kaiceo commented 3 years ago

Workaround: clean the PDF with MuPDF (mutool clean) before importing; after that the import works correctly.

ericzhao007 commented 2 years ago

pdfReader.GetPageRotation undefined