ledongthuc / pdf

PDF reader
BSD 3-Clause "New" or "Revised" License
431 stars 150 forks source link
golang pdf pdf-viewer

PDF Reader

Built with WeBuild

A simple Go library which enables reading PDF files. Forked from https://github.com/rsc/pdf

Features

Install:

go get -u github.com/ledongthuc/pdf

Read plain text

package main

import (
    "bytes"
    "fmt"

    "github.com/ledongthuc/pdf"
)

func main() {
    pdf.DebugOn = true
    content, err := readPdf("test.pdf") // Read local pdf file
    if err != nil {
        panic(err)
    }
    fmt.Println(content)
    return
}

func readPdf(path string) (string, error) {
    f, r, err := pdf.Open(path)
    // remember close file
    defer f.Close()
    if err != nil {
        return "", err
    }
    var buf bytes.Buffer
    b, err := r.GetPlainText()
    if err != nil {
        return "", err
    }
    buf.ReadFrom(b)
    return buf.String(), nil
}

Read all text with styles from PDF

func readPdf2(path string) (string, error) {
    f, r, err := pdf.Open(path)
    // remember close file
    defer f.Close()
    if err != nil {
        return "", err
    }
    totalPage := r.NumPage()

    for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
        p := r.Page(pageIndex)
        if p.V.IsNull() {
            continue
        }
        var lastTextStyle pdf.Text
        texts := p.Content().Text
        for _, text := range texts {
            if isSameSentence(text, lastTextStyle) {
                lastTextStyle.S = lastTextStyle.S + text.S
            } else {
                fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S)
                lastTextStyle = text
            }
        }
    }
    return "", nil
}

Read text grouped by rows

package main

import (
    "fmt"
    "os"

    "github.com/ledongthuc/pdf"
)

func main() {
    content, err := readPdf(os.Args[1]) // Read local pdf file
    if err != nil {
        panic(err)
    }
    fmt.Println(content)
    return
}

func readPdf(path string) (string, error) {
    f, r, err := pdf.Open(path)
    defer func() {
        _ = f.Close()
    }()
    if err != nil {
        return "", err
    }
    totalPage := r.NumPage()

    for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
        p := r.Page(pageIndex)
        if p.V.IsNull() {
            continue
        }

        rows, _ := p.GetTextByRow()
        for _, row := range rows {
            println(">>>> row: ", row.Position)
            for _, word := range row.Content {
                fmt.Println(word.S)
            }
        }
    }
    return "", nil
}

Demo

Run example