serhack / pdf-diff

A tool for visualizing differences between two pdf files.
MIT License
819 stars 41 forks source link

replace poppler with inline code #11

Open gedw99 opened 2 years ago

gedw99 commented 2 years ago

We could replace the poppler-based code at https://github.com/serhack/pdf-diff/blob/5535f71841530a024fd0b8b1f823de03df9a0fc1/main.go#L48 with the following program, if we want:

package main

import (
    "flag"
    "fmt"
    "image/jpeg"
    "os"
    "path/filepath"

    "github.com/gen2brain/go-fitz"
)

// main converts every page of the PDF named by -source into a JPEG image, a
// plain-text file and an HTML file inside the directory named by -target.
// Any failure is reported on stderr and the process exits with status 1.
func main() {
    if err := run(); err != nil {
        fmt.Fprintln(os.Stderr, "error:", err)
        os.Exit(1)
    }
}

// run parses the command-line flags and performs the extraction.
//
// For page n (zero-based) it writes test%03d.jpg, test%03d.txt and
// test%03d.html into the target directory, creating that directory first if
// needed. It returns the first error encountered, wrapped with the page number
// or path that was being processed.
func run() error {
    // flags for the source file (pdf) and the output dir (jpg, txt, html)
    sourceFile := flag.String("source", ".", "source file pdf")
    targetDir := flag.String("target", ".", "target dir")

    flag.Parse()

    fmt.Println("sourceFile:", *sourceFile)
    fmt.Println("targetDir:", *targetDir)

    doc, err := fitz.New(*sourceFile)
    if err != nil {
        return fmt.Errorf("opening %q: %w", *sourceFile, err)
    }
    defer doc.Close()

    if err := os.MkdirAll(*targetDir, os.ModePerm); err != nil {
        return fmt.Errorf("creating target dir %q: %w", *targetDir, err)
    }

    // The page count is loop-invariant, so read it once.
    pages := doc.NumPage()
    for n := 0; n < pages; n++ {
        // Common path prefix shared by the three outputs of this page.
        base := filepath.Join(*targetDir, fmt.Sprintf("test%03d", n))

        // Extract page as image
        img, err := doc.Image(n)
        if err != nil {
            return fmt.Errorf("rendering page %d: %w", n, err)
        }
        err = writeFile(base+".jpg", func(f *os.File) error {
            // Keyed field: unkeyed literals of imported structs are flagged by go vet.
            return jpeg.Encode(f, img, &jpeg.Options{Quality: jpeg.DefaultQuality})
        })
        if err != nil {
            return fmt.Errorf("writing image for page %d: %w", n, err)
        }

        // Extract page as text
        text, err := doc.Text(n)
        if err != nil {
            return fmt.Errorf("extracting text of page %d: %w", n, err)
        }
        err = writeFile(base+".txt", func(f *os.File) error {
            _, werr := f.WriteString(text)
            return werr
        })
        if err != nil {
            return fmt.Errorf("writing text for page %d: %w", n, err)
        }

        // Extract page as html
        html, err := doc.HTML(n, true)
        if err != nil {
            return fmt.Errorf("extracting html of page %d: %w", n, err)
        }
        err = writeFile(base+".html", func(f *os.File) error {
            _, werr := f.WriteString(html)
            return werr
        })
        if err != nil {
            return fmt.Errorf("writing html for page %d: %w", n, err)
        }
    }

    return nil
}

// writeFile creates (or truncates) the file at path, hands it to write, and
// closes it. The file is closed on every path, and a Close error is returned
// when write itself succeeded, so a failed flush is never silently dropped.
func writeFile(path string, write func(f *os.File) error) (err error) {
    f, err := os.Create(path)
    if err != nil {
        return err
    }
    defer func() {
        // Keep the first error: a write failure takes precedence over a close failure.
        if cerr := f.Close(); cerr != nil && err == nil {
            err = cerr
        }
    }()

    return write(f)
}

This should build on every OS, because the prebuilt libraries for each OS are bundled with go-fitz at https://github.com/gen2brain/go-fitz/tree/master/libs

It works for me on macOS. It should probably still be tested on Windows and Linux.

It would replace poppler, which is very heavy IMHO, and make the Go binary fully self-contained in a single file.