go-ego / gse

Efficient multilingual NLP and text segmentation in Go; supports English, Chinese, Japanese, and other languages.
Apache License 2.0
2.57k stars 215 forks source link
chinese english go gse hmm hmm-viterbi-algorithm japanese jieba nlp segment trie

gse

Efficient multilingual NLP and text segmentation in Go; supports English, Chinese, Japanese, and other languages. It also integrates with Elasticsearch and Bleve.

Build Status CircleCI Status codecov Build Status Go Report Card GoDoc GitHub release Join the chat at https://gitter.im/go-ego/ego

简体中文

Gse is an implementation of jieba in Go, and aims to add NLP support and more features.

Feature:

Algorithm:

Text Segmentation speed:

Binding:

gse-bind: bindings for JavaScript and other languages, to support more languages.

Install / update

With Go module support (Go 1.11+), just import:

import "github.com/go-ego/gse"

Otherwise, to install the gse package, run the command:

go get -u github.com/go-ego/gse

Use

package main

import (
    _ "embed"
    "fmt"

    "github.com/go-ego/gse"
)

// testDict holds the contents of testdata/test_en2.txt, embedded at build
// time; main passes it to seg3.LoadDictEmbed.
//
//go:embed testdata/test_en2.txt
var testDict string

// testEn holds the contents of testdata/test_en.txt, embedded at build time;
// main appends it to testDict when loading seg3's dictionary.
//
//go:embed testdata/test_en.txt
var testEn string

var (
    // text is the sample sentence cut by seg1 and (concatenated with test1) by seg3.
    text  = "To be or not to be, that's the question!"
    // test1 is the sample of run-together words cut by seg2 and seg3.
    test1 = "Hiworld, Helloworld!"
)

// main builds three segmenters from English dictionaries and prints the
// result of cutting the sample texts with each of them:
//   - seg1 loads a dictionary file with DictSep set to ",";
//   - seg2 loads a dictionary file with AlphaNum enabled;
//   - seg3 loads the embedded dictionaries with both settings applied.
//
// Every dictionary-loading error is reported on stdout; the example then
// continues so the remaining segmenters are still demonstrated.
func main() {
    var seg1 gse.Segmenter
    seg1.DictSep = ","
    err := seg1.LoadDict("./testdata/test_en.txt")
    if err != nil {
        fmt.Println("Load dictionary error: ", err)
    }

    s1 := seg1.Cut(text)
    fmt.Println("seg1 Cut: ", s1)
    // seg1 Cut:  [to be   or   not to be ,   that's the question!]

    var seg2 gse.Segmenter
    seg2.AlphaNum = true
    // Check the load result here too, as is done for seg1 and seg3, instead
    // of discarding the returned error.
    err = seg2.LoadDict("./testdata/test_en_dict3.txt")
    if err != nil {
        fmt.Println("Load dictionary error: ", err)
    }

    s2 := seg2.Cut(test1)
    fmt.Println("seg2 Cut: ", s2)
    // seg2 Cut:  [hi world ,   hello world !]

    var seg3 gse.Segmenter
    seg3.AlphaNum = true
    seg3.DictSep = ","
    // The two embedded dictionaries are joined with a newline so the last
    // line of the first does not run into the first line of the second.
    err = seg3.LoadDictEmbed(testDict + "\n" + testEn)
    if err != nil {
        fmt.Println("loadDictEmbed error: ", err)
    }
    s3 := seg3.Cut(text + test1)
    fmt.Println("seg3 Cut: ", s3)
    // seg3 Cut:  [to be   or   not to be ,   that's the question! hi world ,   hello world !]

    // example2()
}

Example2:

package main

import (
    "fmt"
    "regexp"

    "github.com/go-ego/gse"
    "github.com/go-ego/gse/hmm/pos"
)

// Package-level state shared by the example functions below.
var (
    // text is the mixed-language sample passed to the segmenters.
    text = "Hello world, Helloworld. Winter is coming! こんにちは世界, 你好世界."

    // new is the segmenter returned by gse.New; cut() uses it.
    // NOTE(review): this name shadows Go's builtin new in this package, and
    // the error returned by gse.New is discarded — consider renaming the
    // variable and checking the error.
    new, _ = gse.New("zh,testdata/test_en_dict3.txt", "alpha")

    // seg is the segmenter whose dictionary is loaded in main.
    seg gse.Segmenter
    // posSeg is the pos.Segmenter used by cutPos.
    posSeg pos.Segmenter
)

// main loads the default dictionary into seg and then runs the cut and
// segCut demonstrations. A dictionary-loading error is reported on stdout
// rather than silently ignored.
func main() {
    // Loading the default dictionary
    if err := seg.LoadDict(); err != nil {
        fmt.Println("Load dictionary error: ", err)
    }
    // Loading the default dictionary with embed
    // seg.LoadDictEmbed()
    //
    // Loading the Simplified Chinese dictionary
    // seg.LoadDict("zh_s")
    // seg.LoadDictEmbed("zh_s")
    //
    // Loading the Traditional Chinese dictionary
    // seg.LoadDict("zh_t")
    //
    // Loading the Japanese dictionary
    // seg.LoadDict("jp")
    //
    // Load the dictionary
    // seg.LoadDict("your gopath"+"/src/github.com/go-ego/gse/data/dict/dictionary.txt")

    cut()

    segCut()
}

func cut() {
    hmm := new.Cut(text, true)
    fmt.Println("cut use hmm: ", hmm)

    hmm = new.CutSearch(text, true)
    fmt.Println("cut search use hmm: ", hmm)
    fmt.Println("analyze: ", new.Analyze(hmm, text))

    hmm = new.CutAll(text)
    fmt.Println("cut all: ", hmm)

    reg := regexp.MustCompile(`(\d+年|\d+月|\d+日|[\p{Latin}]+|[\p{Hangul}]+|\d+\.\d+|[a-zA-Z0-9]+)`)
    text1 := `헬로월드 헬로 서울, 2021年09月10日, 3.14`
    hmm = seg.CutDAG(text1, reg)
    fmt.Println("Cut with hmm and regexp: ", hmm, hmm[0], hmm[6])
}

// analyzeAndTrim prints seg's analysis of the given cut result, then the
// trimmed cut result, and finally the String and Slice renderings of the
// package-level sample text.
func analyzeAndTrim(cut []string) {
    fmt.Println("analyze the segment: ", seg.Analyze(cut, ""))

    trimmed := seg.Trim(cut)
    fmt.Println("cut all: ", trimmed)

    asString := seg.String(text, true)
    fmt.Println(asString)
    asSlice := seg.Slice(text, true)
    fmt.Println(asSlice)
}

// cutPos prints the tagged segments seg.Pos produces for the sample text
// (raw and trimmed), then registers seg with posSeg and prints posSeg's own
// cut, both raw and after TrimWithPos with the "zg" tag.
func cutPos() {
    po := seg.Pos(text, true)
    fmt.Println("pos: ", po)
    po = seg.TrimPos(po)
    fmt.Println("trim pos: ", po)

    // Register seg on the posSeg instance that is used for the calls below;
    // the original called WithGse on the pos package instead of on posSeg.
    posSeg.WithGse(seg)
    po = posSeg.Cut(text, true)
    fmt.Println("pos: ", po)

    po = posSeg.TrimWithPos(po, "zg")
    fmt.Println("trim pos: ", po)
}

func segCut() {
    // Text Segmentation
    tb := []byte(text)
    fmt.Println(seg.String(text, true))

    segments := seg.Segment(tb)
    // Handle word segmentation results, search mode
    fmt.Println(gse.ToString(segments, true))
}

Look at a custom dictionary example

package main

import (
    "fmt"
    _ "embed"

    "github.com/go-ego/gse"
)

// testDict holds the contents of test_en_dict3.txt, embedded at build time;
// main appends it to the dictionary argument of gse.NewEmbed.
//
//go:embed test_en_dict3.txt
var testDict string

// main builds a segmenter from the "zh" dictionary plus a custom embedded
// dictionary, loads the embedded stop words, and prints the sample text cut,
// trimmed, stop-filtered, and segmented.
//
// Errors from building the segmenter or loading stop words are reported on
// stdout. The example stops if the segmenter cannot be built, because every
// later step depends on it.
func main() {
    // var seg gse.Segmenter
    // seg.LoadDict("zh, testdata/zh/test_dict.txt, testdata/zh/test_dict1.txt")
    // seg.LoadStop()
    // NOTE(review): "word 20 n" is concatenated directly with testDict, with
    // no newline in between — confirm this is the intended dictionary format.
    seg, err := gse.NewEmbed("zh, word 20 n"+testDict, "en")
    if err != nil {
        fmt.Println("new embed error: ", err)
        return
    }
    // seg.LoadDictEmbed()
    if err = seg.LoadStopEmbed(); err != nil {
        fmt.Println("load stop embed error: ", err)
    }

    text1 := "Hello world, こんにちは世界, 你好世界!"
    s1 := seg.Cut(text1, true)
    fmt.Println(s1)
    fmt.Println("trim: ", seg.Trim(s1))
    fmt.Println("stop: ", seg.Stop(s1))
    fmt.Println(seg.String(text1, true))

    segments := seg.Segment([]byte(text1))
    fmt.Println(gse.ToString(segments))
}

Look at a Chinese example

Look at a Japanese example

Elasticsearch

How to use it with elasticsearch?

go-gse-elastic

Authors

License

Gse is primarily distributed under the terms of "both the MIT license and the Apache License (Version 2.0)". See LICENSE-APACHE, LICENSE-MIT.

Thanks to sego and jieba (jiebago).