GoLang Binding of HyperScan https://www.hyperscan.io/
example for the following regexes #22

ghost commented 4 years ago


Hope you are all well !

I was wondering if you could provide an example of how to match these patterns with gohs.

    // NOTE(review): the bodies of the `if err != nil {` checks below are not
    // part of this snippet as posted.

    // Bitcoin-address-like token: a leading 1 or 3 followed by 26-33 characters
    // from the class below, anchored at the end of the input.
    bitcoinPatternRegexp, err := regexp.Compile(`[13][a-km-zA-HJ-NP-Z0-9]{26,33}$`)
    if err != nil {

    // E-mail address: local part, '@', domain, then a 2-5 letter final label,
    // anchored at the end of the input.
    emailPatternRegexp, err := regexp.Compile(`([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$`)
    if err != nil {

    // Onion host name with an optional http:// or https:// prefix.
    // Earlier variant kept for reference:
    // (?:https?://)?(?:www)?(\S*?\.onion)\b
    onionPatternRegexp, err := regexp.Compile(`(?:https?\:\/\/)?[\w\-\.]+\.onion`)
    if err != nil {

    // Twitter profile URL; the last capture group is the 1-15 character handle.
    // NOTE(review): the dot in `twitter.com` is unescaped and therefore matches
    // any character — confirm whether `twitter\.com` was intended.
    twitterPatternRegexp, err := regexp.Compile(`(https?\:)?(//)(www[\.])?(twitter.com/)([a-zA-Z0-9_]{1,15})[\/]?`)
    if err != nil {

We are working on an open-source Tor crawler and are starting to have lots of regexes to implement. We could not figure out how to do it with gohs, sorry about that.

Can you give us a snippet?

Thanks in advance for any insights or inputs on that topic.

Cheers, X

ghost commented 4 years ago

I made some tests with the following script and hyperscan 5.3

package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"io"
	"net"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/flier/gohs/hyperscan"
	log "github.com/sirupsen/logrus"
	"github.com/spf13/cobra"
	"github.com/spf13/viper"
)

// with sync for resource lock
type scratch struct {
    s *hyperscan.Scratch

var (
    Version  string
    Debug    bool
    Port     int
    FilePath string
    Flag     string
    Scratch  scratch
    Db       hyperscan.BlockDatabase
    Uptime   time.Time
    RegexMap map[int]RegexLine

// Response is the JSON envelope produced by the match endpoint.
//
// Errno is 0 on success, -1 when the "q" parameter is empty, -2 when the scan
// itself fails and 1 when the scan succeeded but nothing matched. Msg carries
// the accompanying human-readable text and Data the match list.
//
// The struct tags use the `json:"name"` form: encoding/json ignores a tag
// whose value is not quoted, which would leave the keys as "Errno", "Msg" and
// "Data" in the output.
type Response struct {
	Errno int         `json:"errno"`
	Msg   string      `json:"msg"`
	Data  interface{} `json:"data"`
}

type MatchResp struct {
    Id         int       `json:id`
    From       int       `json:from`
    To         int       `json:to`
    Flags      int       `json:flags`
    Context    string    `json:context`
    RegexLinev RegexLine `json:regexline`

// RegexLine is one entry of the pattern file as kept in RegexMap.
//
// The field order is part of the contract: entries are built with an unkeyed
// literal, RegexLine{expr, data}.
type RegexLine struct {
	// Expr is the regular expression source (second tab-separated column).
	Expr string
	// Data is the free-form label of the pattern (third tab-separated column).
	Data string
}

func main() {
    Version = "0.0.1"
    var rootCmd = &cobra.Command{
        Use:     "gohs-ladon",
        Short:   fmt.Sprintf("Gohs-ladon Service %s", Version),
        Run:     run,
        PreRunE: preRunE,
    rootCmd.Flags().Bool("debug", false, "Enable debug mode")
    rootCmd.Flags().Int("port", 8080, "Listen port")
    rootCmd.Flags().String("filepath", "", "Dict file path")
    rootCmd.Flags().String("flag", "iou", "Regex Flag")

    viper.BindPFlag("debug", rootCmd.Flags().Lookup("debug"))
    viper.BindPFlag("port", rootCmd.Flags().Lookup("port"))
    viper.BindPFlag("filepath", rootCmd.Flags().Lookup("filepath"))
    viper.BindPFlag("flag", rootCmd.Flags().Lookup("flag"))


func run(cmd *cobra.Command, args []string) {
    // Todo add a goroutine to check if pattern file changed, and reload file.

    // start web service
    http.Handle("/", middleware(http.HandlerFunc(matchHandle)))
    http.Handle("/_stats", middleware(http.HandlerFunc(statsHandle)))

    addr := fmt.Sprintf("", Port)
    s := &http.Server{
        Addr:         addr,
        ReadTimeout:  1 * time.Second,
        WriteTimeout: 1 * time.Second,
    Uptime = time.Now()

    fmt.Printf("[%s] gohs-ladon %s Running on %s\n", Uptime.Format(time.RFC3339), Version, addr)
    if err := s.ListenAndServe(); err != nil {


func preRunE(cmd *cobra.Command, args []string) error {
    Debug = viper.GetBool("debug")
    Port = viper.GetInt("port")
    FilePath = viper.GetString("filepath")
    Flag = viper.GetString("flag")

    if FilePath == "" {
        return fmt.Errorf("empty regex filepath")
    if Debug {
    } else {
    log.Debug("Prerun", args)
    RegexMap = make(map[int]RegexLine)
    err := buildScratch(FilePath)
    return err

// build scratch for regex file.
func buildScratch(filepath string) (err error) {
    file, err := os.Open(filepath)
    if err != nil {
        return err
    defer file.Close()

    patterns := []*hyperscan.Pattern{}
    var expr hyperscan.Expression
    var id int
    //flags := Flag
    //flags := hyperscan.Caseless | hyperscan.Utf8Mode
    flags, err := hyperscan.ParseCompileFlag(Flag)
    if err != nil {
        return err

    scanner := bufio.NewScanner(file)
    for scanner.Scan() {
        line := scanner.Text()
        // line start with #, skip
        if strings.HasPrefix(strings.TrimSpace(line), "#") {
            log.Info(fmt.Sprintf("line start with #, skip line: %s", line))
        s := strings.Split(line, "\t")
        // length less than 3, skip
        if len(s) < 3 {
            log.Info(fmt.Sprintf("line length less than 3, skip line: %s", line))
        id, err = strconv.Atoi(s[0])
        if err != nil {
            return fmt.Errorf("Atoi error.")
        expr = hyperscan.Expression(s[1])
        data := s[2]
        pattern := &hyperscan.Pattern{Expression: expr, Flags: flags, Id: id}
        patterns = append(patterns, pattern)
        RegexMap[id] = RegexLine{string(expr), data}
    if len(patterns) <= 0 {
        return fmt.Errorf("Empty regex")
    log.Info(fmt.Sprintf("regex file line number: %d", len(patterns)))
    log.Info("Start Building, please wait...")
    db, err := hyperscan.NewBlockDatabase(patterns...)
    if err != nil {
        return err

    Db = db
    scratch, err := hyperscan.NewScratch(Db)
    if err != nil {
        return err
    Scratch.s = scratch

    if err := scanner.Err(); err != nil {
        return err
    return nil

func middleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        start := time.Now()
        next.ServeHTTP(w, r)
        end := time.Now()
        latency := end.Sub(start)
        host, _, _ := net.SplitHostPort(r.RemoteAddr)
            "remote_addr":    host,
            "latency":        latency,
            "content_length": r.ContentLength,
        }).Info(fmt.Sprintf("%s %s", r.Method, r.RequestURI))

func matchHandle(w http.ResponseWriter, r *http.Request) {
    query := r.FormValue("q")
    var resp Response = Response{Errno: 0}
    w.Header().Set("Content-Type", "application/json")
    if query == "" {
        resp.Errno = -1
        resp.Msg = "empty param q"
    } else {
        inputData := []byte(query)
        // results
        var matchResps []MatchResp
        eventHandler := func(id uint, from, to uint64, flags uint, context interface{}) error {
            log.Info(fmt.Sprintf("id: %d, from: %d, to: %d, flags: %v, context: %s", id, from, to, flags, context))
            regexLine, ok := RegexMap[int(id)]
            if !ok {
                regexLine = RegexLine{}
            matchResp := MatchResp{Id: int(id), From: int(from), To: int(to), Flags: int(flags), Context: fmt.Sprintf("%s", context), RegexLinev: regexLine}
            matchResps = append(matchResps, matchResp)
            return nil

        // lock scratch
        if err := Db.Scan(inputData, Scratch.s, eventHandler, inputData); err != nil {
            logFields := log.Fields{"query": query}
            resp.Errno = -2
            resp.Msg = fmt.Sprintf("Db.Scan error: %s", err)
        } else {
            if len(matchResps) <= 0 {
                resp.Errno = 1
                resp.Msg = "no match"
            resp.Data = matchResps
        // unlock scratch

func statsHandle(w http.ResponseWriter, r *http.Request) {
    io.WriteString(w, fmt.Sprintf("gohs-ladon %v, Uptime %v",
        Version, Uptime.Format(time.RFC3339)))

I defined a pattern file like this:

1   [13][a-km-zA-HJ-NP-Z0-9]{26,33}$    bitcoin-wallet
2   ([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$   email-address
3   (?:https?://)?(?:www)?(\S*?\.onion)\b   onion-domain
4   (https?\:)?(//)(www[\.])?(twitter.com/)([a-zA-Z0-9_]{1,15})[\/]?    twitter-account

using the following query:

random test sentence

And it's returning only the last result in the data attribute.

Is there something that I am doing wrong? Is it possible to get multiple matches from the dict?

ghost commented 4 years ago

@flier Any idea why I get only the latest result instead of all of them? Is it due to the hyperscan version?

flier commented 4 years ago

@x0rzkov Sorry, I'm not quite sure what you expected the outcome to be.

For your example, the default flag iou means each regex is matched case-insensitively and only once, so hyperscan will stop the scan as soon as it finds a match.

  i   Caseless
  o   SingleMatch
  u   Utf8Mode

Remove the SingleMatch flag if you want to get multiple matches in a piece of text, or you can use the VectoredScanner interface to scan multiple text segments at the same time.

ghost commented 4 years ago

@flier The expected behaviour was to extract, using the from/to values, the substring for every pattern from the dictionary that matched. I removed SingleMatch, but I still get the weird behaviour described in https://github.com/flier/gohs/issues/24.

If you help me to solve that, I am saved :-)

flier commented 4 years ago

@x0rzkov Please check my comment and example, hyperscan does have some very strange designs for performance. :)

flier commented 4 years ago

Besides, you don't need a lock to access Scratch; it is a very lightweight data structure that can be allocated in each context.