Closed CdeMills closed 6 years ago
I have thought about doing something like this, although in the form of a file rather than a struct. It's not particularly straightforward because of the hashing and RB tree the lib uses internally.
You could embed the list of strings and then generate a string repository during initialization (in which strings are hashed to build the necessary internal structures), although this entails copying the strings, which is probably something you want to avoid given the RAM constraints.
I think the best bet for your use case would be perfect hashing. You could embed the list of strings in an array and then look up strings by ID by indexing the array. You could then use a perfect hash function and a second array to map strings to IDs.
As an example, if you were embedding three strings – january, february, march – a perfect hash function would be strlen, since each string has a different length:
#include <stdint.h>
#include <stdio.h>
#include <string.h>
/* Embedded string table. A string's ID is its position in this array plus
 * one; ID 0 is reserved to mean "not found". */
const char *strings[] = {
    "january",  /* ID 1, length 7 */
    "february", /* ID 2, length 8 */
    "march",    /* ID 3, length 5 */
};

/* Length-indexed hash table: ids[len] is the ID of the one embedded string
 * whose length is len, or 0 when no embedded string has that length. */
const uint32_t ids[] = {
    [5] = 3, /* "march" */
    [7] = 1, /* "january" */
    [8] = 2, /* "february" */
};
/* Translate a 1-based ID into its embedded string.
 * Returns NULL when id is 0 or larger than the number of entries in
 * strings[]. */
const char *lookup_id(uint32_t id) {
    const size_t count = sizeof(strings) / sizeof(strings[0]);

    if (id >= 1 && id <= count) {
        return strings[id - 1];
    }
    return NULL;
}
/* Translate a string into its 1-based ID, or 0 when it is not embedded.
 * The string's length is the hash: it indexes ids[] to pick the single
 * candidate, which is then confirmed with strcmp because unrelated strings
 * can have the same length. */
uint32_t lookup_str(const char *str) {
    const size_t slots = sizeof(ids) / sizeof(ids[0]);
    const size_t length = strlen(str);

    if (length < slots) {
        const uint32_t candidate = ids[length];

        if (candidate != 0 && strcmp(str, strings[candidate - 1]) == 0) {
            return candidate;
        }
    }
    return 0;
}
/* Demo driver: runs lookup_id() and lookup_str() over found and not-found
 * inputs and prints one result per line. */
int main(void) {
    /* IDs 1..3 name the embedded strings; 0 and 4 are not found, so
     * lookup_id() returns NULL for them. Handing NULL to "%s" is undefined
     * behavior, so a placeholder is printed instead. */
    for (uint32_t id = 0; id <= 4; id++) {
        const char *name = lookup_id(id);
        printf("%s\n", name ? name : "(null)");
    }

    /* Expected output: 1, 2, 3, then 0 for the unknown string. */
    static const char *const queries[] = {"january", "february", "march", "xyzzy"};
    for (size_t i = 0; i < sizeof(queries) / sizeof(queries[0]); i++) {
        /* uint32_t need not be unsigned int; cast so the argument matches %u. */
        printf("%u\n", (unsigned)lookup_str(queries[i]));
    }
    return 0;
}
You could use gperf to generate a perfect hash function for a larger list of strings, e.g.
$ cat months
january
february
march
april
may
june
july
august
september
october
november
december
$ gperf months
/* ANSI-C code produced by gperf version 3.1 */
/* Command-line: gperf months */
/* Computed positions: -k'1,3' */
/* Compile-time guard: the generated hash table is indexed by character code,
 * so every printable character is checked against its expected numeric value
 * and the build is stopped with #error if any of them differs. */
#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
&& ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
&& (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
&& ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
&& ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
&& ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
&& ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
&& ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
&& ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
&& ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
&& ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
&& ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
&& ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
&& ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
&& ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
&& ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
&& ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
&& ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
&& ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
&& ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
&& ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
&& ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
&& ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646. */
#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."
#endif
#define TOTAL_KEYWORDS 12
#define MIN_WORD_LENGTH 3
#define MAX_WORD_LENGTH 9
#define MIN_HASH_VALUE 3
#define MAX_HASH_VALUE 18
/* maximum key range = 16, duplicates = 0 */
/*
 * Perfect hash function for the month-name keyword set.
 *
 * The hash is len plus the association values of the first and third
 * characters of the key (key positions 1 and 3).
 *
 * str: key to hash. str[0] and str[2] are read unconditionally, so the
 *      caller must guarantee that at least three characters are readable.
 * len: length of the key.
 *
 * Returns the hash value. For a keyword of the set this is its slot in the
 * word list; for any other input the value may exceed MAX_HASH_VALUE.
 */
#ifdef __GNUC__
__inline
#else
#ifdef __cplusplus
inline
#endif
#endif
static unsigned int
hash (register const char *str, register size_t len)
{
  /* The table is only ever read, so it is declared const; this allows it to
     be kept in read-only storage rather than in writable RAM. Characters
     that occur in no key position map to 19, which is above MAX_HASH_VALUE. */
  static const unsigned char asso_values[] =
    {
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19,  5,  5,  5,
       0, 19,  5,  0, 19, 19,  0, 19, 10,  0,
       0,  5,  0, 19,  0,  0,  0, 19,  0, 19,
      19,  0, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
      19, 19, 19, 19, 19, 19
    };
  /* Cast to unsigned char so a negative plain char cannot index before the table. */
  return len + asso_values[(unsigned char)str[2]] + asso_values[(unsigned char)str[0]];
}
/*
 * Look up a key in the month-name keyword set.
 *
 * str: NUL-terminated key; it is compared with strcmp, and hash() reads its
 *      first and third characters.
 * len: length of str; keys shorter than MIN_WORD_LENGTH or longer than
 *      MAX_WORD_LENGTH are rejected without hashing.
 *
 * Returns a pointer to the matching entry of the internal word list, or
 * NULL when str is not a keyword.
 */
const char *
in_word_set (register const char *str, register size_t len)
{
  /* Slot i holds the keyword whose hash value is i; "" marks an unused slot.
     Both the strings and the pointer array are const, so the whole table can
     be kept in read-only storage rather than in writable RAM. */
  static const char * const wordlist[] =
    {
      "", "", "",
      "may",
      "june",
      "march",
      "",
      "january",
      "november",
      "september",
      "april",
      "august",
      "october",
      "december",
      "july",
      "", "", "",
      "february"
    };

  /* The length check also guarantees that str[2], read by hash(), exists. */
  if (len < MIN_WORD_LENGTH || len > MAX_WORD_LENGTH)
    return NULL;

  register unsigned int key = hash (str, len);
  if (key > MAX_HASH_VALUE)
    return NULL;

  /* A perfect hash leaves a single candidate; compare the first character
     before falling back to strcmp on the rest. */
  register const char *s = wordlist[key];
  if (*str == *s && !strcmp (str + 1, s + 1))
    return s;

  return NULL;
}
Woaw ! I didn't know about gperf. A lot simpler indeed. Thanks a lot.
Pascal
Hello, I would like to use your framework to speed up the code of a command interpreter on an embedded system. The particular constraints are that the amount of RAM is quite small and that the set of strings is fixed.
The procedure would be: 1) on a development computer, take a list of strings and generate "something" which could be inserted into a C program, basically as: struct ZZZ my_frozen_strings={{.val1 = XXX, .val2 = YYY}, ...}; 2) on the target platform, add this 'image' and include only the code needed to search for one string in the set. No growing / pruning / snapshots.
Where should I start?
Regards
Pascal