chriso / intern

Fast, efficient string interning
MIT License
89 stars 6 forks source link

Consider generating a static "image" #2

Closed CdeMills closed 6 years ago

CdeMills commented 6 years ago

Hello, I would like to use your framework to speed up the code of a command interpreter on an embedded system. The particular constraints are that the amount of RAM is quite small and that the set of strings is fixed.

The procedure would be: 1) on a development computer, take a list of strings and generate "something" which could be inserted into a C program, basically as: struct ZZZ my_frozen_strings={{.val1 = XXX, .val2 = YYY}, ...}; 2) on the target platform, add this 'image' and incorporate only the code needed to search for one string in the set. No grow / pruning / snapshot.

Where should I start?

Regards

Pascal

chriso commented 6 years ago

I have thought about doing something like this, although in the form of a file rather than a struct. It's not particularly straightforward because of the hashing and RB tree the lib uses internally.

You could embed the list of strings and then generate a string repository during initialization (in which strings are hashed to build the necessary internal structures), although this entails copying the strings, which is probably something you're looking to avoid given the RAM constraints.

I think the best bet for your use-case would be perfect hashing. You could embed the list of strings in an array and then look up strings by ID by indexing the array. You could then use a perfect hash function and another array to map strings to IDs.

As an example, if you were embedding three strings – january, february, march – a perfect hash function would be strlen since each string has a different length:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* The embedded string set; a string's ID is its position here plus one. */
const char *strings[] = {
    "january",
    "february",
    "march",
};

/*
 * Perfect-hash table indexed by string length: each slot holds the 1-based
 * ID of the string with that length, or 0 when no embedded string has that
 * length. Slots not listed explicitly are zero-initialized.
 */
const uint32_t ids[] = {
    [5] = 3, /* "march"    */
    [7] = 1, /* "january"  */
    [8] = 2, /* "february" */
};

/*
 * Map a 1-based string ID back to its embedded string.
 *
 * Returns the entry of `strings` at position id - 1, or NULL when id is 0
 * or larger than the number of entries in `strings`.
 */
const char *lookup_id(uint32_t id) {
    const size_t count = sizeof(strings) / sizeof(strings[0]);

    if (id >= 1 && id <= count) {
        return strings[id - 1];
    }
    return NULL;
}

/*
 * Map a string to its 1-based ID.
 *
 * The string's length is used as a perfect hash into `ids`; the candidate
 * found there is then confirmed with strcmp, since unrelated strings can
 * share a length with an embedded one.
 *
 * Returns the ID (usable with lookup_id), or 0 when str is NULL or is not
 * one of the embedded strings.
 */
uint32_t lookup_str(const char *str) {
    /* strlen(NULL) is undefined behavior; treat NULL as "not found". */
    if (str == NULL) {
        return 0;
    }

    size_t len = strlen(str);
    if (len >= sizeof(ids) / sizeof(*ids)) {
        return 0; /* longer than any slot in the table */
    }

    uint32_t id = ids[len];
    if (id == 0 || strcmp(str, strings[id - 1]) != 0) {
        return 0; /* empty slot, or same length but different content */
    }
    return id;
}

/*
 * Print the string registered under `id`, or "(null)" when the ID is unknown.
 *
 * lookup_id returns NULL for unknown IDs, and passing NULL to a %s conversion
 * is undefined behavior (compilers may lower printf("%s\n", p) to puts(p),
 * which crashes on NULL), so the NULL case is handled explicitly.
 */
static void print_id(uint32_t id) {
    const char *name = lookup_id(id);
    printf("%s\n", name ? name : "(null)");
}

/* Demonstrates both lookup directions, including the not-found cases. */
int main(void) {
    print_id(0); // "(null)" (not found)
    print_id(1); // "january"
    print_id(2); // "february"
    print_id(3); // "march"
    print_id(4); // "(null)" (not found)

    /* uint32_t is not necessarily unsigned int, so cast to match %u. */
    printf("%u\n", (unsigned)lookup_str("january")); // => 1
    printf("%u\n", (unsigned)lookup_str("february")); // => 2
    printf("%u\n", (unsigned)lookup_str("march")); // => 3
    printf("%u\n", (unsigned)lookup_str("xyzzy")); // => 0 (not found)
    return 0;
}

You could use gperf to generate a perfect hash function for a larger list of strings, e.g.

$ cat months 
january
february
march
april
may
june
july
august
september
october
november
december
$ gperf months 
/* ANSI-C code produced by gperf version 3.1 */
/* Command-line: gperf months  */
/* Computed positions: -k'1,3' */

/*
 * Compile-time guard: verify that the execution character set assigns the
 * ASCII code points to every character listed below. The hash function
 * indexes its lookup table by character code, so the generated tables are
 * only valid under that encoding; compilation stops with #error otherwise.
 */
#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
      && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
      && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
      && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
      && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
      && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
      && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
      && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
      && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
      && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
      && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
      && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
      && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
      && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
      && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
      && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
      && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
      && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
      && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
      && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
      && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
      && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
      && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646.  */
#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."
#endif

#define TOTAL_KEYWORDS 12
#define MIN_WORD_LENGTH 3
#define MAX_WORD_LENGTH 9
#define MIN_HASH_VALUE 3
#define MAX_HASH_VALUE 18
/* maximum key range = 16, duplicates = 0 */

/*
 * Perfect hash for the keyword set: the keyword length plus the table values
 * associated with its first and third characters.
 *
 * str  - candidate string; str[0] and str[2] are read unconditionally, so the
 *        caller must guarantee at least three readable characters
 *        (in_word_set enforces len >= MIN_WORD_LENGTH before calling).
 * len  - length of str as supplied by the caller.
 *
 * Returns a value in [MIN_HASH_VALUE, MAX_HASH_VALUE] for every keyword;
 * other inputs may hash inside or outside that range.
 *
 * Differences from the raw gperf output: the table is declared const so it
 * can be placed in read-only storage instead of RAM (what gperf's
 * --readonly-tables option produces), and the `register` storage class,
 * which no longer exists in C++17, has been dropped.
 */
#ifdef __GNUC__
__inline
#else
#ifdef __cplusplus
inline
#endif
#endif
static unsigned int
hash (const char *str, size_t len)
{
  /* Indexed by character code; 19 (> MAX_HASH_VALUE) marks characters that
     never appear at the inspected positions of any keyword. */
  static const unsigned char asso_values[] =
    {
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19,  5,  5,  5,
       0, 19,  5,  0, 19, 19,  0, 19, 10,  0,
       0,  5,  0, 19,  0,  0,  0, 19,  0, 19,
      19,  0, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19
    };
  /* Explicit narrowing: the result type is unsigned int, len is size_t. */
  return (unsigned int)len
         + asso_values[(unsigned char)str[2]]
         + asso_values[(unsigned char)str[0]];
}

/*
 * Test whether str is one of the keywords.
 *
 * str  - NUL-terminated candidate string (it is compared with strcmp).
 * len  - length of str; presumably strlen(str), supplied by the caller so it
 *        need not be recomputed here (TODO confirm against callers).
 *
 * Returns a pointer to the matching entry of the internal keyword table, or
 * NULL when str is not a keyword.
 *
 * Differences from the raw gperf output: the pointer table is fully const so
 * it can be placed in read-only storage instead of RAM, `register` has been
 * dropped, and the not-found result is spelled NULL rather than 0.
 */
const char *
in_word_set (const char *str, size_t len)
{
  /* Indexed by hash value; "" marks hash values no keyword maps to. */
  static const char * const wordlist[] =
    {
      "", "", "",
      "may",
      "june",
      "march",
      "",
      "january",
      "november",
      "september",
      "april",
      "august",
      "october",
      "december",
      "july",
      "", "", "",
      "february"
    };

  /* The length check also guarantees hash() may read str[0] and str[2]. */
  if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
    {
      unsigned int key = hash (str, len);

      if (key <= MAX_HASH_VALUE)
        {
          const char *s = wordlist[key];

          /* Cheap first-character test before the full comparison. */
          if (*str == *s && !strcmp (str + 1, s + 1))
            return s;
        }
    }
  return NULL;
}
CdeMills commented 6 years ago

Wow! I didn't know about gperf. A lot simpler indeed. Thanks a lot.

Pascal