pemistahl / lingua-go

The most accurate natural language detection library for Go, suitable for short text and mixed-language text
Apache License 2.0
1.18k stars 66 forks source link

Panics at loadJson #2

Closed dsxack closed 3 years ago

dsxack commented 3 years ago

Code to reproduce:

package main

import (
    "fmt"
    "github.com/pemistahl/lingua-go"
)

func main() {
    languages := []lingua.Language{
        lingua.English,
        lingua.French,
        lingua.German,
        lingua.Spanish,
    }

    detector := lingua.NewLanguageDetectorBuilder().
        FromLanguages(languages...).
        Build()

    confidenceValues := detector.ComputeLanguageConfidenceValues("languages are awesome")

    for _, elem := range confidenceValues {
        fmt.Printf("%s: %.2f\n", elem.Language(), elem.Value())
    }

    // Output:
    // English: 1.00
    // French: 0.79
    // German: 0.75
    // Spanish: 0.72
}

go.mod

module lingua

go 1.16

require github.com/pemistahl/lingua-go v1.0.0

go env:

❯ go env
GO111MODULE="on"
GOARCH="amd64"
GOBIN=""
GOCACHE="/Users/dmitriysmotrov/Library/Caches/go-build"
GOENV="/Users/dmitriysmotrov/Library/Application Support/go/env"
GOEXE=""
GOFLAGS=""
GOHOSTARCH="amd64"
GOHOSTOS="darwin"
GOINSECURE=""
GOMODCACHE="/Users/dmitriysmotrov/.gvm/pkgsets/go1.16.5/global/pkg/mod"
GONOPROXY=""
GONOSUMDB=""
GOOS="darwin"
GOPATH="/Users/dmitriysmotrov/.gvm/pkgsets/go1.16.5/global"
GOPRIVATE=""
GOPROXY="https://proxy.golang.org,direct"
GOROOT="/Users/dmitriysmotrov/.gvm/gos/go1.16.5"
GOSUMDB="sum.golang.org"
GOTMPDIR=""
GOTOOLDIR="/Users/dmitriysmotrov/.gvm/gos/go1.16.5/pkg/tool/darwin_amd64"
GOVCS=""
GOVERSION="go1.16.5"
GCCGO="gccgo"
AR="ar"
CC="clang"
CXX="clang++"
CGO_ENABLED="1"
GOMOD="/Users/dmitriysmotrov/space/dsxack/lingua/go.mod"
CGO_CFLAGS="-g -O2"
CGO_CPPFLAGS=""
CGO_CXXFLAGS="-g -O2"
CGO_FFLAGS="-g -O2"
CGO_LDFLAGS="-g -O2"
PKG_CONFIG="pkg-config"
GOGCCFLAGS="-fPIC -arch x86_64 -m64 -pthread -fno-caret-diagnostics -Qunused-arguments -fmessage-length=0 -fdebug-prefix-map=/var/folders/z5/8ts06jv92yjc5sp5mdsdzr2h0000gn/T/go-build2817996487=/tmp/go-build -gno-record-gcc-switches -fno-common"

Expected: no panics

Actual:

panic: runtime error: invalid memory address or nil pointer dereference
    panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0x10e9c82]

goroutine 22 [running]:
archive/zip.(*ReadCloser).Close(0x0, 0x0, 0x0)
    /Users/dmitriysmotrov/.gvm/gos/go1.16.5/src/archive/zip/reader.go:161 +0x22
panic(0x11841e0, 0x12c0160)
    /Users/dmitriysmotrov/.gvm/gos/go1.16.5/src/runtime/panic.go:965 +0x1b9
github.com/pemistahl/lingua-go.loadJson(0x18, 0x5, 0x0, 0x0, 0x0)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/json.go:32 +0x18e
github.com/pemistahl/lingua-go.loadFivegrams(...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/fivegrams.go:925
github.com/pemistahl/lingua-go.germanFivegramModel.func1.1()
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/fivegrams.go:368 +0x45
sync.(*Once).doSlow(0xc0000b0f30, 0xc000094b68)
    /Users/dmitriysmotrov/.gvm/gos/go1.16.5/src/sync/once.go:68 +0xec
sync.(*Once).Do(...)
    /Users/dmitriysmotrov/.gvm/gos/go1.16.5/src/sync/once.go:59
github.com/pemistahl/lingua-go.germanFivegramModel.func1(0x11831c0, 0xc00009afc0)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/fivegrams.go:367 +0xbb
github.com/pemistahl/lingua-go.languageDetector.lookUpNgramProbability(0xc0000d49c0, 0x4, 0x4, 0x0, 0xc00012c080, 0x2, 0x2, 0xc00009b0b0, 0xc00009b050, 0xc00009af90, ...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:530 +0x1cb
github.com/pemistahl/lingua-go.languageDetector.computeSumOfNgramProbabilities(0xc0000d49c0, 0x4, 0x4, 0x0, 0xc00012c080, 0x2, 0x2, 0xc00009b0b0, 0xc00009b050, 0xc00009af90, ...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:516 +0xf7
github.com/pemistahl/lingua-go.languageDetector.computeLanguageProbabilities(0xc0000d49c0, 0x4, 0x4, 0x0, 0xc00012c080, 0x2, 0x2, 0xc00009b0b0, 0xc00009b050, 0xc00009af90, ...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:474 +0xca
github.com/pemistahl/lingua-go.languageDetector.lookUpLanguageModels(0xc0000d49c0, 0x4, 0x4, 0x0, 0xc00012c080, 0x2, 0x2, 0xc00009b0b0, 0xc00009b050, 0xc00009af90, ...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:442 +0xca
created by github.com/pemistahl/lingua-go.languageDetector.ComputeLanguageConfidenceValues
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:170 +0x525
panic: runtime error: invalid memory address or nil pointer dereference
    panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0x10e9c82]

goroutine 18 [running]:
archive/zip.(*ReadCloser).Close(0x0, 0x0, 0x0)
    /Users/dmitriysmotrov/.gvm/gos/go1.16.5/src/archive/zip/reader.go:161 +0x22
panic(0x11841e0, 0x12c0160)
    /Users/dmitriysmotrov/.gvm/gos/go1.16.5/src/runtime/panic.go:965 +0x1b9
github.com/pemistahl/lingua-go.loadJson(0x18, 0x1, 0x0, 0x0, 0x0)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/json.go:32 +0x18e
github.com/pemistahl/lingua-go.loadUnigrams(...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/unigrams.go:925
github.com/pemistahl/lingua-go.germanUnigramModel.func1.1()
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/unigrams.go:368 +0x45
sync.(*Once).doSlow(0xc0000b1d40, 0xc000064b68)
    /Users/dmitriysmotrov/.gvm/gos/go1.16.5/src/sync/once.go:68 +0xec
sync.(*Once).Do(...)
    /Users/dmitriysmotrov/.gvm/gos/go1.16.5/src/sync/once.go:59
github.com/pemistahl/lingua-go.germanUnigramModel.func1(0x11831c0, 0xc00009b050)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/unigrams.go:367 +0xbb
github.com/pemistahl/lingua-go.languageDetector.lookUpNgramProbability(0xc0000d49c0, 0x4, 0x4, 0x0, 0xc00012c080, 0x2, 0x2, 0xc00009b0b0, 0xc00009b050, 0xc00009af90, ...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:538 +0x128
github.com/pemistahl/lingua-go.languageDetector.computeSumOfNgramProbabilities(0xc0000d49c0, 0x4, 0x4, 0x0, 0xc00012c080, 0x2, 0x2, 0xc00009b0b0, 0xc00009b050, 0xc00009af90, ...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:516 +0xf7
github.com/pemistahl/lingua-go.languageDetector.computeLanguageProbabilities(0xc0000d49c0, 0x4, 0x4, 0x0, 0xc00012c080, 0x2, 0x2, 0xc00009b0b0, 0xc00009b050, 0xc00009af90, ...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:474 +0xca
github.com/pemistahl/lingua-go.languageDetector.lookUpLanguageModels(0xc0000d49c0, 0x4, 0x4, 0x0, 0xc00012c080, 0x2, 0x2, 0xc00009b0b0, 0xc00009b050, 0xc00009af90, ...)
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:442 +0xca
created by github.com/pemistahl/lingua-go.languageDetector.ComputeLanguageConfidenceValues
    /Users/dmitriysmotrov/.gvm/pkgsets/go1.16rc1/global/pkg/mod/github.com/pemistahl/lingua-go@v1.0.0/detector.go:170 +0x525
pemistahl commented 3 years ago

Hi @dsxack, thank you for this report. Unfortunately, I'm not able to reproduce the panic on my development machine running macOS 10.15. Does it occur every time you run the program or just occasionally? Can you give me any other hint about how to reproduce it?

For what it's worth, here is the output of my go env:

GO111MODULE="on"
GOARCH="amd64"
GOBIN=""
GOCACHE="/Users/pemistahl/Library/Caches/go-build"
GOENV="/Users/pemistahl/Library/Application Support/go/env"
GOEXE=""
GOFLAGS=""
GOHOSTARCH="amd64"
GOHOSTOS="darwin"
GOINSECURE=""
GOMODCACHE="/Users/pemistahl/go/pkg/mod"
GONOPROXY=""
GONOSUMDB=""
GOOS="darwin"
GOPATH="/Users/pemistahl/go"
GOPRIVATE=""
GOPROXY="https://proxy.golang.org,direct"
GOROOT="/usr/local/Cellar/go/1.16.5/libexec"
GOSUMDB="sum.golang.org"
GOTMPDIR=""
GOTOOLDIR="/usr/local/Cellar/go/1.16.5/libexec/pkg/tool/darwin_amd64"
GOVCS=""
GOVERSION="go1.16.5"
GCCGO="gccgo"
AR="ar"
CC="clang"
CXX="clang++"
CGO_ENABLED="1"
GOMOD="/Users/pemistahl/Documents/git-repositories/lingua-go/go.mod"
CGO_CFLAGS="-g -O2"
CGO_CPPFLAGS=""
CGO_CXXFLAGS="-g -O2"
CGO_FFLAGS="-g -O2"
CGO_LDFLAGS="-g -O2"
PKG_CONFIG="pkg-config"
GOGCCFLAGS="-fPIC -arch x86_64 -m64 -pthread -fno-caret-diagnostics -Qunused-arguments -fmessage-length=0 -fdebug-prefix-map=/var/folders/ld/j3qxrhnn5nq2qk0yjplz2flc0000gn/T/go-build2115905117=/tmp/go-build -gno-record-gcc-switches -fno-common"
dsxack commented 3 years ago

Hi @pemistahl, I've reproduced it in Docker. I will write a script and post it here.

dsxack commented 3 years ago

@pemistahl, here is a script to reproduce it with Docker:

#!/usr/bin/env bash

# debug bash commands
set -x

# make project dir
mkdir test && cd test;

# write main.go
cat <<EOF > main.go
package main

import (
    "fmt"
    "github.com/pemistahl/lingua-go"
)

func main() {
    languages := []lingua.Language{
        lingua.English,
        lingua.French,
        lingua.German,
        lingua.Spanish,
    }

    detector := lingua.NewLanguageDetectorBuilder().
        FromLanguages(languages...).
        Build()

    confidenceValues := detector.ComputeLanguageConfidenceValues("languages are awesome")

    for _, elem := range confidenceValues {
        fmt.Printf("%s: %.2f\n", elem.Language(), elem.Value())
    }

    // Output:
    // English: 1.00
    // French: 0.79
    // German: 0.75
    // Spanish: 0.72
}
EOF

# run docker container, init go mod, get dependencies and run.
docker run --rm -it -v $(pwd):/app golang bash -c 'set -x; \
  cd /app \
  && go mod init testing \
  && go get github.com/pemistahl/lingua-go \
  && go run main.go'
dsxack commented 3 years ago

I think the panic happens because you use a path that is relative to the current working directory here:

func loadJson(language Language, ngramLength int) []byte {
    ngramName := getNgramNameByLength(ngramLength)
    isoCode := strings.ToLower(language.IsoCode639_1().String())
    zipFilePath := fmt.Sprintf("language-models/%s/%ss.json.zip", isoCode, ngramName) // <=== this path is relative to the working directory, not to the module sources
    zipFile, _ := zip.OpenReader(zipFilePath)
    defer zipFile.Close()
    jsonFile, _ := zipFile.File[0].Open()
    defer jsonFile.Close()
    jsonFileContent, _ := io.ReadAll(jsonFile)
    return jsonFileContent
}
dsxack commented 3 years ago

I know three options for using resources in Go modules:

  1. Use runtime.Caller: _, path, _, _ := runtime.Caller(0). I think this variant is bad because the resources will not be available from a compiled binary. This problem also applies to the current code.
  2. You can pack the resources with go:embed, but it requires Go 1.16. That is OK because your library already requires Go 1.16.
  3. Put the resources into variables in *.go source files. You can use go:generate for that, but it is worse than go:embed.
dsxack commented 3 years ago

I can prepare an MR for both variants if you would like help :)

dsxack commented 3 years ago

@pemistahl

Does it occur every time you run the program or just occasionally?

It occurs every time. But if I add lingua.Russian and pass the phrase "языки это круто" to detector.ComputeLanguageConfidenceValues, it works without a panic and prints:

Russian: 1.00