darvid / python-hyperscan

šŸ A CPython extension for the Hyperscan regular expression matching library.
https://python-hyperscan.readthedocs.io/en/latest/
MIT License
165 stars 28 forks source link

More simple examples #22

Closed ghost closed 3 years ago

ghost commented 4 years ago

Hi,

Hope you are all well !

Can you provide more examples of matching multiple patterns against a simple text?

Moreover, I am a gopher rather than a Pythonista, but I am trying to learn Python.

For now, in Go, I wrote the following program to create a database of patterns and to scan a text input.

patterns.txt

10  [$]\s?[+-]?[0-9]{1,3}(?:(?:,?[0-9]{3}))*(?:\.[0-9]{1,2})?    PricePattern          
11  (?:#?([0-9a-fA-F]{6}|[0-9a-fA-F]{3}))    HexColorPattern       
12  (?:(?:(?:\d{4}[- ]?){3}\d{4}|\d{15,16}))     CreditCardPattern     
13  4\d{3}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}  VISACreditCardPattern 
14  5[1-5]\d{2}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}     MCCreditCardPattern   
15  [13][a-km-zA-HJ-NP-Z1-9]{25,34}  BtcAddressPattern     
16  \d{1,4} [\w\s]{1,20}(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\W?    StreetAddressPattern  
17  \b\d{5}(?:[-\s]\d{4})?\b     ZipCodePattern        
18  (?i)P\.? ?O\.? Box \d+   PoBoxPattern          
19  (?:\d{3}-\d{2}-\d{4})    SSNPattern            
20  [0-9a-fA-F]{32}  MD5HexPattern         
21  [0-9a-fA-F]{40}  SHA1HexPattern        
22  [0-9a-fA-F]{64}  SHA256HexPattern      
23  [0-9a-fA-F]{8}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{12}  GUIDPattern           
24  (?:[\d]-?){12}[\dxX]     ISBN13Pattern         
25  (?:[\d]-?){9}[\dxX]  ISBN10Pattern         
26  (([a-fA-F0-9]{2}[:-]){5}([a-fA-F0-9]{2}))    MACAddressPattern     
27  [A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}([A-Z\d]?){0,16}    IBANPattern           
28  ((git|ssh|http(s)?)|(git@[\w\.]+))(:(\/\/)?)([\w\.@\:/\-~]+)(\.git)(\/)?     GitRepoPattern        
29  (?:https?://)?(?:www)?(\S*?\.onion)\b   OnionDomain

main.go

package main

import (
    "bufio"
    "encoding/json"
    "fmt"
    "io"
    "net"
    "net/http"
    "os"
    "strconv"
    "strings"
    "sync"
    "time"

    "github.com/flier/gohs/hyperscan"
    "github.com/k0kubun/pp"
    log "github.com/sirupsen/logrus"
    "github.com/spf13/cobra"
    "github.com/spf13/viper"
)

// scratch pairs the hyperscan scratch space with a lock; matchHandle takes
// the lock around Db.Scan so only one scan uses the scratch at a time.
type scratch struct {
    sync.RWMutex
    // s is allocated by buildScratch via hyperscan.NewScratch.
    s *hyperscan.Scratch
}

// Package-level state shared between command setup and the HTTP handlers.
var (
    // Version is the service version string; assigned in main.
    Version  string
    // Debug, Port, FilePath and Flag are read from viper in preRunE.
    Debug    bool
    Port     int
    FilePath string
    Flag     string
    // Scratch holds the scratch space allocated by buildScratch.
    Scratch  scratch
    // Db is the compiled block database produced by buildScratch.
    Db       hyperscan.BlockDatabase
    // Uptime is the time recorded in run just before the server starts.
    Uptime   time.Time
    // RegexMap maps pattern id to its source line; created in preRunE and
    // filled by buildScratch.
    RegexMap map[int]RegexLine
)

// Response is the JSON envelope written by matchHandle.
//
// Errno is 0 on success, -1 when the "q" parameter is empty, -2 when the
// scan fails and 1 when the scan succeeds without any match. Msg carries the
// human-readable explanation and Data the match results.
//
// The struct tags must be quoted (`json:"errno"`); unquoted tags are ignored
// by encoding/json and the keys fall back to the Go field names.
type Response struct {
    Errno int         `json:"errno"`
    Msg   string      `json:"msg"`
    Data  interface{} `json:"data"`
}

// MatchResp describes one match reported to the scan callback in matchHandle.
//
// Id, From, To and Flags are the values hyperscan passes to the callback,
// converted to int. Context is the callback's context value formatted as a
// string, and RegexLinev is the RegexMap entry for Id (zero value when the
// id is unknown).
//
// The struct tags must be quoted to take effect; unquoted tags are ignored
// by encoding/json and the keys fall back to the Go field names.
type MatchResp struct {
    Id         int       `json:"id"`
    From       int       `json:"from"`
    To         int       `json:"to"`
    Flags      int       `json:"flags"`
    Context    string    `json:"context"`
    RegexLinev RegexLine `json:"regexline"`
}

// RegexLine is one parsed row of the pattern file, stored in RegexMap under
// the row's numeric id.
type RegexLine struct {
    // Expr is the expression text taken from the second tab-separated column.
    Expr string
    // Data is the third tab-separated column, kept verbatim.
    Data string
}

// main builds the cobra root command, registers the command-line flags,
// binds each of them to viper (so environment variables can override them via
// AutomaticEnv) and executes the command.
//
// The process exits with status 1 when a flag cannot be bound or when the
// command fails (for example when preRunE rejects the configuration), so
// callers and supervisors can detect the failure.
func main() {
    Version = "0.0.1"
    viper.AutomaticEnv()
    var rootCmd = &cobra.Command{
        Use:     "gohs-ladon",
        Short:   fmt.Sprintf("Gohs-ladon Service %s", Version),
        Run:     run,
        PreRunE: preRunE,
    }

    flags := rootCmd.Flags()
    flags.Bool("debug", false, "Enable debug mode")
    flags.Int("port", 8080, "Listen port")
    flags.String("filepath", "", "Dict file path")
    flags.String("flag", "ioum", "Regex Flag")

    for _, name := range []string{"debug", "port", "filepath", "flag"} {
        if err := viper.BindPFlag(name, flags.Lookup(name)); err != nil {
            fmt.Fprintln(os.Stderr, err)
            os.Exit(1)
        }
    }

    // cobra already prints the error (and usage) itself; only the exit
    // status has to be propagated here.
    if err := rootCmd.Execute(); err != nil {
        os.Exit(1)
    }
}

// run registers the HTTP routes on the default mux and serves on
// 0.0.0.0:Port until ListenAndServe returns an error, which is fatal.
func run(cmd *cobra.Command, args []string) {
    // Todo add a goroutine to check if pattern file changed, and reload file.

    // Every route goes through the logging middleware.
    for _, route := range []struct {
        pattern string
        handler http.HandlerFunc
    }{
        {"/", matchHandle},
        {"/_stats", statsHandle},
    } {
        http.Handle(route.pattern, middleware(route.handler))
    }

    listenAddr := fmt.Sprintf("0.0.0.0:%d", Port)
    Uptime = time.Now()
    fmt.Printf("[%s] gohs-ladon %s Running on %s\n", Uptime.Format(time.RFC3339), Version, listenAddr)

    // No Handler is set, so the server uses the default mux populated above.
    srv := &http.Server{
        Addr:         listenAddr,
        ReadTimeout:  time.Second,
        WriteTimeout: time.Second,
    }
    err := srv.ListenAndServe()
    if err != nil {
        log.Fatal(err)
    }
}

// preRunE copies the viper settings into the package-level configuration,
// sets the log level, and compiles the pattern file. It returns an error when
// no pattern file path is configured or when buildScratch fails.
func preRunE(cmd *cobra.Command, args []string) error {
    Debug, Port = viper.GetBool("debug"), viper.GetInt("port")
    FilePath, Flag = viper.GetString("filepath"), viper.GetString("flag")

    if FilePath == "" {
        return fmt.Errorf("empty regex filepath")
    }

    level := log.InfoLevel
    if Debug {
        level = log.DebugLevel
    }
    log.SetLevel(level)
    log.Debug("Prerun", args)

    RegexMap = make(map[int]RegexLine)
    return buildScratch(FilePath)
}

// buildScratch reads the pattern file at filepath, compiles every pattern
// into a hyperscan block database and allocates the matching scratch space.
//
// Each usable line has at least three tab-separated columns: a numeric id,
// the expression, and a free-form data string. Lines whose first non-space
// character is '#' and lines with fewer than three columns are skipped.
// All patterns are compiled with the flags parsed from the package-level Flag.
//
// On success it stores the database in Db and the scratch in Scratch.s;
// RegexMap is filled as lines are parsed. It returns an error when the file
// cannot be opened or read, the flag string or an id is invalid, no pattern
// was found, or hyperscan fails to compile or allocate.
func buildScratch(filepath string) (err error) {
    file, err := os.Open(filepath)
    if err != nil {
        return fmt.Errorf("opening pattern file %q: %w", filepath, err)
    }
    defer file.Close()

    flags, err := hyperscan.ParseCompileFlag(Flag)
    if err != nil {
        return fmt.Errorf("parsing compile flag %q: %w", Flag, err)
    }

    var patterns []*hyperscan.Pattern
    scanner := bufio.NewScanner(file)
    for lineNo := 1; scanner.Scan(); lineNo++ {
        line := scanner.Text()
        log.Debug(line)
        // line start with #, skip
        if strings.HasPrefix(strings.TrimSpace(line), "#") {
            log.Info(fmt.Sprintf("line start with #, skip line: %s", line))
            continue
        }
        cols := strings.Split(line, "\t")
        // length less than 3, skip
        if len(cols) < 3 {
            log.Info(fmt.Sprintf("line length less than 3, skip line: %s", line))
            continue
        }
        id, convErr := strconv.Atoi(strings.TrimSpace(cols[0]))
        if convErr != nil {
            return fmt.Errorf("%s:%d: invalid pattern id %q: %w", filepath, lineNo, cols[0], convErr)
        }
        expr := hyperscan.Expression(cols[1])
        patterns = append(patterns, &hyperscan.Pattern{Expression: expr, Flags: flags, Id: id})
        RegexMap[id] = RegexLine{string(expr), cols[2]}
    }
    // Check for a read error before anything else: a truncated read must not
    // be reported as an empty file or produce a partial database.
    if err := scanner.Err(); err != nil {
        return fmt.Errorf("reading pattern file %q: %w", filepath, err)
    }
    if len(patterns) == 0 {
        return fmt.Errorf("no patterns found in %q", filepath)
    }

    log.Info(fmt.Sprintf("regex file line number: %d", len(patterns)))
    log.Info("Start Building, please wait...")
    db, err := hyperscan.NewBlockDatabase(patterns...)
    if err != nil {
        return fmt.Errorf("compiling patterns: %w", err)
    }
    sc, err := hyperscan.NewScratch(db)
    if err != nil {
        return fmt.Errorf("allocating scratch: %w", err)
    }

    // Publish the globals only once both pieces exist.
    Db = db
    Scratch.s = sc
    return nil
}

// middleware wraps next so that every request is logged, after it has been
// served, with the client host, the handling latency and the request's
// content length.
func middleware(next http.Handler) http.Handler {
    logged := func(w http.ResponseWriter, r *http.Request) {
        began := time.Now()
        next.ServeHTTP(w, r)
        elapsed := time.Since(began)

        // Split errors are ignored; the host is then logged as empty.
        remoteHost, _, _ := net.SplitHostPort(r.RemoteAddr)
        fields := log.Fields{
            "remote_addr":    remoteHost,
            "latency":        elapsed,
            "content_length": r.ContentLength,
        }
        log.WithFields(fields).Info(fmt.Sprintf("%s %s", r.Method, r.RequestURI))
    }
    return http.HandlerFunc(logged)
}

// matchHandle scans the "q" form value against the compiled database and
// writes a JSON Response describing every match.
//
// Errno in the response is -1 when "q" is empty, -2 when the scan fails,
// 1 when nothing matched and 0 otherwise. The status code is left to the
// implicit 200 sent with the first body write; calling WriteHeader after the
// body has been written has no effect and only triggers a "superfluous
// WriteHeader" warning.
func matchHandle(w http.ResponseWriter, r *http.Request) {
    query := r.FormValue("q")
    resp := Response{Errno: 0}
    w.Header().Set("Content-Type", "application/json")

    if query == "" {
        resp.Errno = -1
        resp.Msg = "empty param q"
    } else {
        // Scan what the client sent rather than a fixed sample text.
        inputData := []byte(query)

        // results
        var matchResps []MatchResp
        eventHandler := func(id uint, from, to uint64, flags uint, context interface{}) error {
            log.Info(fmt.Sprintf("id: %d, from: %d, to: %d, flags: %v, context: %s", id, from, to, flags, context))
            // An unknown id yields the zero RegexLine.
            regexLine := RegexMap[int(id)]
            matchResps = append(matchResps, MatchResp{
                Id:         int(id),
                From:       int(from),
                To:         int(to),
                Flags:      int(flags),
                Context:    fmt.Sprintf("%s", context),
                RegexLinev: regexLine,
            })
            return nil
        }

        // The scratch space is shared, so hold the lock for the scan only.
        Scratch.Lock()
        err := Db.Scan(inputData, Scratch.s, eventHandler, inputData)
        Scratch.Unlock()

        switch {
        case err != nil:
            log.WithFields(log.Fields{"query": query}).Error(err)
            resp.Errno = -2
            resp.Msg = fmt.Sprintf("Db.Scan error: %s", err)
        case len(matchResps) == 0:
            resp.Errno = 1
            resp.Msg = "no match"
            resp.Data = matchResps
        default:
            resp.Data = matchResps
        }
        // Debug dump of the collected results.
        pp.Println("resp.Data:", resp.Data)
    }

    if err := json.NewEncoder(w).Encode(resp); err != nil {
        log.WithFields(log.Fields{"query": query}).Error(err)
    }
}

func statsHandle(w http.ResponseWriter, r *http.Request) {
    w.WriteHeader(http.StatusOK)
    io.WriteString(w, fmt.Sprintf("gohs-ladon %v, Uptime %v",
        Version, Uptime.Format(time.RFC3339)))
}

Is it possible to create a small example (not RESTful, if that is too complicated) that loads a text file of patterns and processes a simple text input?

Thanks in advance for any insights or inputs on that.

Cheers, X

darvid commented 4 years ago

here's a simple script that should hopefully help:

import sys

import hyperscan

def parse_patterns(filename):
    """Read a tab-separated patterns file into ``[id, expression, name]`` rows.

    Lines starting with ``#`` and lines that do not have exactly three
    tab-separated columns are skipped. Every column is stripped of
    surrounding whitespace; the id is converted to ``int`` and the
    expression is encoded to ``bytes``.

    Raises ``ValueError`` if the id column of a three-column line is not an
    integer.
    """
    rows = []
    with open(filename) as handle:
        for raw in handle:
            if raw.startswith('#'):
                continue
            fields = [part.strip() for part in raw.split('\t')]
            if len(fields) != 3:
                continue
            ident, expression, name = fields
            rows.append([int(ident), expression.encode(), name])
    return rows

def match_handler(pattern_id, start, end, flags, context):
    """Print one match: the pattern's name and the matched slice of the input.

    ``context`` must provide ``'names'`` (pattern id -> name) and ``'input'``
    (the scanned text); the slice ``input[start:end]`` is what gets printed.
    """
    name = context['names'][pattern_id]
    matched = context['input'][start:end]
    print('[match] {}: `{}`'.format(name, matched))

def main(args):
    """Compile the patterns file and scan a single input string.

    ``args`` is ``[patterns_file, input]``. Each match is reported through
    ``match_handler``. The process exits with status 1 when the argument
    count is wrong or when the patterns file yields no usable pattern.
    """
    if len(args) != 2:
        print(f'usage: {sys.argv[0]} [patterns file] [input]')
        sys.exit(1)
    pattern_filename, input_str = args
    patterns = parse_patterns(pattern_filename)
    if not patterns:
        # zip(*[]) yields nothing, so the three-way unpack below would fail
        # with an opaque ValueError; report the real problem instead.
        print(f'no usable patterns found in {pattern_filename}', file=sys.stderr)
        sys.exit(1)
    db = hyperscan.Database()
    pattern_ids, expressions, names = zip(*patterns)
    db.compile(expressions=expressions, ids=pattern_ids)
    context = {
        'input': input_str,
        # pattern id -> human-readable name, used by match_handler
        'names': dict(zip(pattern_ids, names)),
    }
    db.scan(
        input_str,
        match_event_handler=match_handler,
        context=context,
    )

# When executed as a script, pass the command-line arguments (without the
# program name) to main.
if __name__ == '__main__':
    main(sys.argv[1:])

python-hyperscan on  master [$?] is 📦 v0.1.5 via 🐍 v3.8.2rc2+ (hyperscan-nTr-HRfE-py3.8)
❯ python hsmatch.py patterns.txt git://foo.git
[match] GitRepoPattern: `git://foo.git`

python-hyperscan on  master [$?] is 📦 v0.1.5 via 🐍 v3.8.2rc2+ (hyperscan-nTr-HRfE-py3.8)
❯ python hsmatch.py patterns.txt foo.onion
[match] OnionDomain: `foo.onion`

The unit tests should be another resource you can use, as they cover most of the Python interface.

ghost commented 4 years ago

Awesome, I'll try it now and keep you updated.

Thanks again

ghost commented 4 years ago

I tried with the following input

python main.py patterns.txt "foo.onion git://foo.git"

and it returned me

[match] OnionDomain: `foo.onion`
[match] GitRepoPattern: `foo.onion git://foo.git`

Is there an easy way to extract only the substring that matched the pattern?

darvid commented 4 years ago

see Start of Match

replace the db.compile lines with this:

    # HS_FLAG_SOM_LEFTMOST is applied to the compiled patterns so that the
    # `start` offset passed to the match handler marks where the match
    # begins; without it the printed substring ran from the start of input.
    db.compile(
        expressions=expressions,
        ids=pattern_ids,
        flags=hyperscan.HS_FLAG_SOM_LEFTMOST,
    )
ghost commented 4 years ago

It works better.

 python main.py patterns.txt "foo.onion git://foo.git"
[match] OnionDomain: `foo.onion`
[match] GitRepoPattern: `git://foo.git`

But I tried the following:

python main.py patterns.txt "x0rzkov@protonmail.com 1HB5XMLmzFVj8ALj6mfBsbifRoD4miY36v https://twitter.com/x0rxkov random test sentence https://twitter.com/twitter"

and got this output:

[match] BtcAddressPattern: `1HB5XMLmzFVj8ALj6mfBsbifRo`
[match] BtcAddressPattern: `1HB5XMLmzFVj8ALj6mfBsbifRoD`
[match] BtcAddressPattern: `1HB5XMLmzFVj8ALj6mfBsbifRoD4`
[match] BtcAddressPattern: `1HB5XMLmzFVj8ALj6mfBsbifRoD4m`
[match] BtcAddressPattern: `1HB5XMLmzFVj8ALj6mfBsbifRoD4mi`
[match] BtcAddressPattern: `1HB5XMLmzFVj8ALj6mfBsbifRoD4miY`
[match] BtcAddressPattern: `1HB5XMLmzFVj8ALj6mfBsbifRoD4miY3`
[match] BtcAddressPattern: `1HB5XMLmzFVj8ALj6mfBsbifRoD4miY36`
[match] BtcAddressPattern: `1HB5XMLmzFVj8ALj6mfBsbifRoD4miY36v`

Is my regex for bitcoin not properly written ?

Cheers, X

darvid commented 4 years ago

try adding a \b at the end of the btc pattern