simsong / bulk_extractor

This is the development tree. Production downloads are at:
https://github.com/simsong/bulk_extractor/releases
Other
1.09k stars 187 forks source link

Rewrite scan_base16.flex for optimization #247

Open jonstewart opened 3 years ago

jonstewart commented 3 years ago

scan_base16.flex is a good candidate for a hard-coded implementation of a bit-parallel automaton. The matching alphabet is relatively small and repeated, and the number of states should be small. Such an implementation can likely identify and decode hex data faster than a general-purpose regexp library, or Flex.

This should only be done if timing data indicates that scan_base16.flex has significant runtime overhead compared to other scanners. It could happen post-2.0.

simsong commented 3 years ago

Here is the flex-generated NFA:

#define YY_NUM_RULES 3
#define YY_END_OF_BUFFER 4
/* This struct is not used in this scanner,
   but its presence is necessary. */
struct yy_trans_info
        {
        flex_int32_t yy_verify;
        flex_int32_t yy_nxt;
        };
static yyconst flex_int16_t yy_accept[44] =
    {   0,
        0,    0,    4,    2,    2,    0,    0,    0,    0,    0,
        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        0,    0,    0,    0,    0,    1,    0,    0,    0,    0,
        0,    1,    0
    } ;

static yyconst flex_int32_t yy_ec[256] =
    {   0,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    2,
        1,    1,    3,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    4,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    5,    5,    5,
        5,    5,    5,    5,    5,    5,    5,    1,    1,    1,
        1,    1,    1,    1,    5,    5,    5,    5,    5,    5,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    5,    5,    5,    5,

        5,    5,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,

        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
        1,    1,    1,    1,    1
    } ;

static yyconst flex_int32_t yy_meta[6] =
    {   0,
        1,    2,    1,    1,    3
    } ;

static yyconst flex_int16_t yy_base[70] =
    {   0,
       75,   74,   78,   81,    0,    0,    4,    0,    0,    0,
        0,    8,   12,    0,    0,    0,    0,   16,   20,    0,
        0,    0,    0,   24,   28,    0,    0,    0,    0,   32,
       36,    0,    0,    0,    0,   40,   44,    0,    0,    0,
        0,    0,   81,   49,   74,   74,   72,   71,   71,   70,
       68,   67,   67,   66,   64,   63,   63,   62,   60,   59,
       59,   58,   56,   55,   55,   54,   52,   51,   51
    } ;

static yyconst flex_int16_t yy_def[70] =
    {   0,
       44,   44,   43,   43,   45,   43,   43,   46,   47,   48,
       49,   43,   43,   50,   51,   52,   53,   43,   43,   54,
       55,   56,   57,   43,   43,   58,   59,   60,   61,   43,
       43,   62,   63,   64,   65,   43,   43,   66,   67,   68,
       69,   36,    0,   43,   43,   43,   43,   43,   43,   43,
       43,   43,   43,   43,   43,   43,   43,   43,   43,   43,
       43,   43,   43,   43,   43,   43,   43,   43,   43
    } ;

static yyconst flex_int16_t yy_nxt[87] =
    {   0,
       43,    7,    8,    7,    9,   10,   11,   10,    9,   13,
       14,   13,   15,   16,   17,   16,   15,   19,   20,   19,
       21,   22,   23,   22,   21,   25,   26,   25,   27,   28,
       29,   28,   27,   31,   32,   31,   33,   34,   35,   34,
       33,   37,   38,   37,   39,   40,   41,   40,   39,    4,
        4,    4,   40,   39,   42,   37,   34,   33,   36,   31,
       28,   27,   30,   25,   22,   21,   24,   19,   16,   15,
       18,   13,   10,    9,   12,    7,    6,   43,    5,    5,
        3,   43,   43,   43,   43,   43
    } ;

static yyconst flex_int16_t yy_chk[87] =
    {   0,
        0,    6,    6,    6,    6,    7,    7,    7,    7,   12,
       12,   12,   12,   13,   13,   13,   13,   18,   18,   18,
       18,   19,   19,   19,   19,   24,   24,   24,   24,   25,
       25,   25,   25,   30,   30,   30,   30,   31,   31,   31,
       31,   36,   36,   36,   36,   37,   37,   37,   37,   44,
       44,   44,   69,   68,   67,   66,   65,   64,   63,   62,
       61,   60,   59,   58,   57,   56,   55,   54,   53,   52,
       51,   50,   49,   48,   47,   46,   45,    3,    2,    1,
       43,   43,   43,   43,   43,   43
    } ;

That's pretty small, and I think that it's super-efficient. My feeling is to leave this as it is, as it works.

jonstewart commented 3 years ago

There are a whole class of algorithms that simulate automata in bit vectors, many of them coming out of bioinformatics (where the alphabet size is an appealing 4). This description of a “shift DFA” made the rounds a few months ago, but there are many similar techniques:

https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725

Joel and I built a very fast hex encoder a while back using AVX2. Writing a corresponding decoder would be a fun exercise sometime, but not otherwise warranted.

On Sep 12, 2021, at 4:26 PM, Simson L. Garfinkel @.***> wrote:

 Here is the flex-generated NFA:

define YY_NUM_RULES 3

define YY_END_OF_BUFFER 4

/ This struct is not used in this scanner, but its presence is necessary. / struct yy_trans_info { flex_int32_t yy_verify; flex_int32_t yy_nxt; }; static yyconst flex_int16_t yy_accept[44] = { 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0 } ;

static yyconst flex_int32_t yy_ec[256] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5,

    5,    5,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,

    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    1,    1,    1,    1,    1
} ;

static yyconst flex_int32_t yy_meta[6] = { 0, 1, 2, 1, 1, 3 } ;

static yyconst flex_int16_t yy_base[70] = { 0, 75, 74, 78, 81, 0, 0, 4, 0, 0, 0, 0, 8, 12, 0, 0, 0, 0, 16, 20, 0, 0, 0, 0, 24, 28, 0, 0, 0, 0, 32, 36, 0, 0, 0, 0, 40, 44, 0, 0, 0, 0, 0, 81, 49, 74, 74, 72, 71, 71, 70, 68, 67, 67, 66, 64, 63, 63, 62, 60, 59, 59, 58, 56, 55, 55, 54, 52, 51, 51 } ;

static yyconst flex_int16_t yy_def[70] = { 0, 44, 44, 43, 43, 45, 43, 43, 46, 47, 48, 49, 43, 43, 50, 51, 52, 53, 43, 43, 54, 55, 56, 57, 43, 43, 58, 59, 60, 61, 43, 43, 62, 63, 64, 65, 43, 43, 66, 67, 68, 69, 36, 0, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43 } ;

static yyconst flex_int16_t yy_nxt[87] = { 0, 43, 7, 8, 7, 9, 10, 11, 10, 9, 13, 14, 13, 15, 16, 17, 16, 15, 19, 20, 19, 21, 22, 23, 22, 21, 25, 26, 25, 27, 28, 29, 28, 27, 31, 32, 31, 33, 34, 35, 34, 33, 37, 38, 37, 39, 40, 41, 40, 39, 4, 4, 4, 40, 39, 42, 37, 34, 33, 36, 31, 28, 27, 30, 25, 22, 21, 24, 19, 16, 15, 18, 13, 10, 9, 12, 7, 6, 43, 5, 5, 3, 43, 43, 43, 43, 43 } ;

static yyconst flex_int16_t yy_chk[87] = { 0, 0, 6, 6, 6, 6, 7, 7, 7, 7, 12, 12, 12, 12, 13, 13, 13, 13, 18, 18, 18, 18, 19, 19, 19, 19, 24, 24, 24, 24, 25, 25, 25, 25, 30, 30, 30, 30, 31, 31, 31, 31, 36, 36, 36, 36, 37, 37, 37, 37, 44, 44, 44, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 3, 2, 1, 43, 43, 43, 43, 43, 43 } ; That's pretty small, and I think that it's super-efficient. My feeling is to leave this as it is, as it works.

— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub, or unsubscribe. Triage notifications on the go with GitHub Mobile for iOS or Android.

simsong commented 3 years ago

Huh. How about that. Big question: is there are lot of hex-encoded data out there? That's the real question.