intel / hyperscan

High-performance regular expression matching library
https://www.hyperscan.io
Other
4.71k stars 705 forks source link

Approximate match (edit distance and hamming distance) #412

Open Haigegege opened 10 months ago

Haigegege commented 10 months ago

Hi, I am studying about edit distance and hamming distance in my hyperscan env. but it's a little hard for me to understand how to find whitch one is the best matching. my example like this

type ScanContext struct {
    *bytes.Buffer
    data []byte
}

func main() {
    // distance = 3
    // one of pattern is like "tipsytemasagronomicos.com"
    patterns = append(patterns, hs.NewPattern(pattern, hs.Caseless, hs.EditDistance(uint32(distance))))
    ...

    // create hyperscan db
    hsdb, err := hs.NewBlockDatabase(patterns...)
    scratch, err := hs.NewScratch(*hsdb)
    ...

    buf := new(bytes.Buffer)
    input:= "13243412tipsytemasagr0n0mic0s.c0m"
    if err := (*d.HsDatabase).Scan([]byte(input), scratch, handleMatch, ScanContext{buf, []byte(input)}); err != nil {
            fmt.Println(err)
    }
    fmt.Printf("match: %s\n", buf.String())
    if len(buf.String()) > 0 {
        log.Println("Levenshtein distance match:", true)
    }
}

func handleMatch(id uint, from, to uint64, flags uint, data interface{}) error {
    ctx, _ := data.(ScanContext)
    fmt.Println("from:", from)
    fmt.Println("to:", to)
    fmt.Println("data:", string(ctx.data))
    if from < to {
        _, err := fmt.Fprintf(ctx.Buffer, "%s", ctx.data[from:to])
        if err != nil {
            return err
        }
    }

    return nil
}

The result is like that: image image

So, how can I do for next step, can you guys show me some example about the approximate match?