vovkos / jancy

The first and only scripting language with safe pointer arithmetics, high level of ABI and source compatibility with C, spreadsheet-like reactive programming, built-in lexer generator, and more.
http://jancy.org
MIT License
61 stars 9 forks source link

Missing C Runtime Functions #5

Open mingodad opened 3 years ago

mingodad commented 3 years ago

While trying to port Lua regular-expression patterns to Jancy, I needed the following C runtime functions, which are currently missing and which I think would benefit others as well:

isalpha
iscntrl
isdigit
isgraph
islower
ispunct
isspace
isupper
isalnum
isxdigit
mingodad commented 3 years ago

Also missing are the math functions and the integer-limit macros defined in header <limits.h>:

CHAR_BIT

number of bits in a byte 
(macro constant)

MB_LEN_MAX

maximum number of bytes in a multibyte character 
(macro constant)

CHAR_MIN

minimum value of char 
(macro constant)

CHAR_MAX

maximum value of char 
(macro constant)

SCHAR_MIN
SHRT_MIN
INT_MIN
LONG_MIN
LLONG_MIN

(C99)

minimum value of signed char, short, int, long and long long respectively 
(macro constant)

SCHAR_MAX
SHRT_MAX
INT_MAX
LONG_MAX
LLONG_MAX

(C99)

maximum value of signed char, short, int, long and long long respectively 
(macro constant)

UCHAR_MAX
USHRT_MAX
UINT_MAX
ULONG_MAX
ULLONG_MAX

(C99)

maximum value of unsigned char, unsigned short, unsigned int,
unsigned long and unsigned long long respectively 
(macro constant)
...
mingodad commented 3 years ago

My lua-regex.jnc so far (to show what I'm trying to achieve) that has this error:

jancy "lua-regex.jnc"
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(303,50): binary '+' cannot be applied to 'char [9]' and 'char*'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(182,48): unexpected 'identifier' in 'literal'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): undeclared identifier 'INT_MAX'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): unable to recover from previous error(s)
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(42,1172): binary '+' cannot be applied to 'char [2]' and 'char const*'
5 error(s); compilation failed

lua-regex.jnc

/*
** maximum number of captures that a pattern can do during
** pattern-matching. This limit is arbitrary.
*/

/* Pointer-difference type used by max_expand and str_find_aux below;
   defined here as the pointer-sized signed integer. */
alias ptrdiff_t = intptr_t;

/* Constants shared by the matcher; the functions below refer to them
   by their bare names. */
exposed enum LuaRegexConsts
{
    /* number of slots in LuaMatchState.capture[] */
    LUA_REGEX_MAXCAPTURES = 32,
    /* LuaCapture.len marker: capture has been opened but not closed yet */
    CAP_UNFINISHED = -1,
    /* LuaCapture.len marker: position capture, i.e. `()` in the pattern */
    CAP_POSITION = -2,
    /* escape character of the pattern language */
    L_ESC = '%',
}

/* One capture slot recorded while matching. */
struct LuaCapture {
    const char *init;  /* where the capture starts in the subject (set by start_capture) */
    intptr_t len;      /* byte length once closed by end_capture; otherwise the
                          CAP_UNFINISHED or CAP_POSITION marker passed to start_capture */
};

/* Working state threaded through every matching routine. */
struct LuaMatchState {
  const char *src_init;  /* init of source string */
  const char *src_end;  /* end ('\0') of source string */
  const char *p_end;  /* end ('\0') of pattern */
  size_t   start_pos;  /* pattern match start position */
  size_t   end_pos;    /* pattern match end position */
  const char *error;  /* static message describing the last pattern error; reset to null by str_find_aux */
  int level;  /* total number of captures (finished or unfinished) */
  LuaCapture capture[LUA_REGEX_MAXCAPTURES];  /* capture slots; entries [0, level) are in use */
};

/* Per-match callback type accepted by str_find_aux. It is invoked there as
   fp(ms, udata, null) after each successful match; a non-zero return makes
   str_find_aux try to search again past the current match. */
typedef int luaregex_func_param(LuaMatchState *ms, const void *udata, void **b);

/* macro to `unsign' a character */
//#define uchar(c)    ((unsigned char)(c))
/* Reinterpret a (possibly signed) char as an unsigned char value. */
static unsigned char uchar(char c) {
  unsigned char as_unsigned = (unsigned char)c;
  return as_unsigned;
}

//static const char L_ESC = '%';
/* Characters that have a special meaning in a pattern; nospecials() scans
   for any of them to decide whether a plain substring search is enough. */
static const char SPECIALS[]  =  "^$*+?.([%-";

/* Intended to wrap x in single quotes (function form of the C macro LUA_QL).
   NOTE(review): the compiler rejects this with "binary '+' cannot be applied
   to 'char [2]' and 'char const*'" -- char pointers/arrays cannot be added.
   Build the string with the language's string-formatting facility instead. */
static char *LUA_QL(const char *x)  {return  "'" + x + "'"; }
/* Returns LUA_QL("%s"), i.e. a quoted "%s" placeholder.
   Not referenced by any other code visible in this file. */
static char *LUA_QS() {return LUA_QL("%s");}

/*
** Normalizes a string position: a non-negative pos is returned as is,
** a negative pos counts back from the end of a string of length len,
** and anything that still lands before the start is clamped to 0.
*/
static intptr_t posrelat (intptr_t pos, size_t len) {
  if (pos >= 0)
    return pos;  /* already an absolute position */
  pos += len;    /* relative to the end */
  if (pos < 0)
    return 0;    /* reached past the beginning: clamp */
  return pos;
}

/*
** Verifies that none of the ms->level captures in use is still open.
** Returns 1 when all are closed; otherwise sets ms->error and returns 0.
*/
static int check_capture_all_closed (LuaMatchState *ms) {
  int idx = 0;
  while (idx < ms->level) {
    if (ms->capture[idx].len == CAP_UNFINISHED) {
      ms->error = "unfinished capture";
      return 0;
    }
    ++idx;
  }
  return 1;
}

/*
** Verifies that l is the index of an existing, already closed capture.
** Returns 1 on success; otherwise sets ms->error and returns 0.
*/
static int check_capture_is_closed (LuaMatchState *ms, int l) {
  int in_range = (l >= 0 && l < ms->level);
  if (!in_range) {
    ms->error = "invalid capture index";
    return 0;
  }
  /* the slot is only inspected once the index is known to be valid */
  if (ms->capture[l].len != CAP_UNFINISHED)
    return 1;
  ms->error = "unfinished capture";
  return 0;
}

/*
** Converts the capture reference in *l_out from its character code
** ('1' maps to index 0) to a capture index, stores the index back into
** *l_out, and reports whether it names a closed capture.
*/
static int check_capture (LuaMatchState *ms, int *l_out) {
  int idx = *l_out - '1';
  *l_out = idx;
  return check_capture_is_closed(ms, idx);
}

/*
** Finds the most recently opened capture that is still unfinished.
** On success stores its index in *level_out and returns 1; when there
** is none, sets ms->error and returns 0.
*/
static int capture_to_close (LuaMatchState *ms, int *level_out) {
  int idx = ms->level - 1;
  while (idx >= 0) {
    if (ms->capture[idx].len == CAP_UNFINISHED) {
      *level_out = idx;
      return 1;
    }
    idx--;
  }
  ms->error = "invalid pattern capture";
  return 0;
}

/*
** Finds the end of the single pattern item starting at p (an escaped
** character, a `[...]` set, or a plain character).
** On success stores the address just past the item in *result and
** returns 1. On a malformed pattern sets ms->error and returns 0.
**
** The error texts are written as plain literals: they are exactly what
** the C macro LUA_QL ("'" x "'") would have produced. A function call
** cannot be spliced between adjacent string literals.
*/
static int classend (LuaMatchState *ms, const char *p, const char **result) {
  switch (*p++) {
    case L_ESC: {
      if (p == ms->p_end){
          ms->error = "malformed pattern (ends with '%%')";
          return 0;
      }
      *result = p+1;  /* skip the escaped character too */
      return 1;
    }
    case '[': {
      if (*p == '^') p++;
      do {  /* look for a `]' */
        if (p == ms->p_end){
            ms->error = "malformed pattern (missing ']')";
            return 0;
        }
        if (*(p++) == L_ESC && p < ms->p_end)
          p++;  /* skip escapes (e.g. `%]') */
      } while (*p != ']');
      *result = p+1;
      return 1;
    }
    default: {
      *result = p;
      return 1;
    }
  }
}

/* Returns non-zero when c is an ASCII letter ('a'..'z' or 'A'..'Z').
   The upper-case range test used to be inverted (c <= 'A' && c >= 'Z'),
   which can never be true, so upper-case letters were rejected. */
static int isalpha(int c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

/*
** Tests character c against the class letter cl (the character that
** follows L_ESC in a pattern). A lower-case class letter yields the
** class test result; any other case of the same letter yields its
** negation. When cl is not a class letter at all, c must equal cl.
*/
static int match_class (int c, int cl) {
  int is_member;
  int kind = tolower(cl);

  if (kind == 'a')      is_member = isalpha(c);
  else if (kind == 'c') is_member = iscntrl(c);
  else if (kind == 'd') is_member = isdigit(c);
  else if (kind == 'g') is_member = isgraph(c);
  else if (kind == 'l') is_member = islower(c);
  else if (kind == 'p') is_member = ispunct(c);
  else if (kind == 's') is_member = isspace(c);
  else if (kind == 'u') is_member = isupper(c);
  else if (kind == 'w') is_member = isalnum(c);
  else if (kind == 'x') is_member = isxdigit(c);
  else if (kind == 'z') is_member = (c == 0);  /* deprecated option */
  else
    return (cl == c);  /* not a class letter: literal comparison */

  if (islower(cl))
    return is_member;
  return !is_member;
}

/*
** Tests character c against the bracket set that starts at p (pointing
** at '[') and whose closing ']' is at ec. Handles a leading '^'
** (negated set), escaped classes, `a-b` ranges and single characters.
** Returns non-zero when c is accepted by the set.
*/
static int matchbracketclass (int c, const char *p, const char *ec) {
  int on_hit = 1;  /* value returned when c is found in the set */
  if (*(p+1) == '^') {
    on_hit = 0;
    p++;  /* skip the `^' */
  }
  for (++p; p < ec; ++p) {
    if (*p == L_ESC) {
      ++p;  /* move to the class letter */
      if (match_class(c, uchar(*p)))
        return on_hit;
      continue;
    }
    if ((*(p+1) == '-') && (p+2 < ec)) {
      p += 2;  /* p now points at the upper bound of the range */
      if (uchar(*(p-2)) <= c && c <= uchar(*p))
        return on_hit;
      continue;
    }
    if (uchar(*p) == c)
      return on_hit;
  }
  return !on_hit;
}

/*
** Tests character c against the single pattern item at p, whose end
** (as computed by classend) is ep. Returns non-zero on a match.
*/
static int singlematch (int c, const char *p, const char *ep) {
  if (*p == '.')
    return 1;  /* matches any char */
  if (*p == L_ESC)
    return match_class(c, uchar(*(p+1)));
  if (*p == '[')
    return matchbracketclass(c, p, ep-1);
  return (uchar(*p) == c);
}

//static const char *match (LuaMatchState *ms, const char *s, const char *p);

//add escape char extension from https://github.com/jcgoble3/lua-matchext
/*
** Matches a balanced run for `%bxy` (or the extension `%Bxey`, where e
** is an escape character whose following character is skipped).
** p points at the opening delimiter inside the pattern; s is the
** current subject position.
** Returns the position just past the closing delimiter, or null when
** the subject does not start with the opener or is out of balance.
** A pattern with missing arguments sets ms->error and returns null.
**
** The "no escape character" case is handled by testing `escaped`
** directly, so no INT_MAX sentinel is needed; the error text is the
** plain literal the C macro LUA_QL would have produced.
*/
static const char *matchbalance (LuaMatchState *ms, const char *s,
                                   const char *p) {
  int escaped = (*(p-1) == 'B'); /* EXT */
  if (p >= ms->p_end - 1 - escaped){
    ms->error = "malformed pattern (missing arguments to '%%b')";
    return null;
  }
  if (*s != *p) return null;
  else {
    int b = *p;                          /* opening delimiter */
    int e = *(p + (escaped ? 2 : 1));    /* closing delimiter */ /* EXT */
    int esc = escaped ? *(p + 1) : 0;    /* only consulted when escaped */ /* EXT */
    int cont = 1;                        /* current nesting depth */
    while (++s < ms->src_end) {
      if (escaped && *s == esc) s++;     /* skip the escaped character */ /* EXT */
      else if (*s == e) {
        if (--cont == 0) return s+1;
      }
      else if (*s == b) cont++;
    }
  }
  return null;  /* string ends out of balance */
}

/*
** Greedy repetition: consumes as many subject characters matching the
** item [p, ep) as possible, then backs off one character at a time
** until the rest of the pattern (after the suffix at ep) matches.
** Returns the end of the overall match, or null.
*/
static const char *max_expand (LuaMatchState *ms, const char *s,
                                 const char *p, const char *ep) {
  ptrdiff_t count = 0;  /* number of repetitions currently taken */
  while ((s+count)<ms->src_end && singlematch(uchar(*(s+count)), p, ep))
    count++;
  /* try the longest run first, then progressively shorter ones */
  for (; count >= 0; count--) {
    const char *res = match(ms, (s+count), ep+1);
    if (res) return res;
  }
  return null;
}

/*
** Lazy repetition: tries to match the rest of the pattern (after the
** suffix at ep) first, and consumes one more subject character matching
** the item [p, ep) only when that fails.
** Returns the end of the overall match, or null.
*/
static const char *min_expand (LuaMatchState *ms, const char *s,
                                 const char *p, const char *ep) {
  for (;;) {
    const char *res = match(ms, s, ep+1);
    if (res != null)
      return res;
    if (!(s<ms->src_end && singlematch(uchar(*s), p, ep)))
      return null;  /* cannot take another repetition */
    s++;  /* try with one more repetition */
  }
}

/*
** Opens a new capture at subject position s, tagging its length with
** `what` (CAP_UNFINISHED or CAP_POSITION), and continues matching at p.
** The capture is dropped again when the rest of the match fails.
** Returns the end of the match, or null (also when all capture slots
** are used, in which case ms->error is set).
*/
static const char *start_capture (LuaMatchState *ms, const char *s,
                                    const char *p, int what) {
  int slot = ms->level;
  if (slot >= LUA_REGEX_MAXCAPTURES) {
      ms->error = "too many captures";
      return null;
  }
  ms->capture[slot].init = s;
  ms->capture[slot].len = what;
  ms->level = slot+1;
  const char *res = match(ms, s, p);
  if (res == null)  /* match failed? */
    ms->level--;  /* undo capture */
  return res;
}

/*
** Closes the innermost open capture at subject position s and continues
** matching at p. The capture is re-opened when the rest of the match
** fails. Returns the end of the match, or null.
*/
static const char *end_capture (LuaMatchState *ms, const char *s,
                                  const char *p) {
  int slot;
  if (!capture_to_close(ms, &slot))
    return null;
  ms->capture[slot].len = s - ms->capture[slot].init;  /* close capture */
  const char *res = match(ms, s, p);
  if (res == null)  /* match failed? */
    ms->capture[slot].len = CAP_UNFINISHED;  /* undo capture */
  return res;
}

/*
** Back-reference: l is the character code of the digit that follows
** L_ESC. Succeeds when the subject at s starts with the text of the
** referenced (closed) capture. Returns the position after that text,
** or null.
*/
static const char *match_capture (LuaMatchState *ms, const char *s, int l) {
  if (!check_capture(ms, &l))
    return null;
  size_t len = ms->capture[l].len;
  if ((size_t)(ms->src_end-s) < len)
    return null;  /* not enough subject left */
  if (memcmp(ms->capture[l].init, s, len) != 0)
    return null;
  return s+len;
}

/*
** Core matcher: tries to match pattern p against the subject at s.
** Returns the subject position just past the match, or null when there
** is no match or the pattern is malformed (ms->error is then set by
** whichever helper detected the problem).
**
** The for(;;) loop replaces the tail-recursive gotos of the C original:
** `continue` restarts with updated s/p. Leaving the outer switch
** (by `break`, or by running off the end of a case) falls into the
** "default" item handling below it.
**
** The `%f` error message is a single literal; it used to be assembled
** with `+` on string literals and char pointers, which does not compile.
*/
static const char *match (LuaMatchState *ms, const char *s, const char *p) {
  //init: /* using goto's to optimize tail recursion */
  for(;;) {
  if (p == ms->p_end)  /* end of pattern? */
    return s;  /* match succeeded */
  switch (*p) {
    case '(': {  /* start capture */
      if (*(p+1) == ')')  /* position capture? */
        return start_capture(ms, s, p+2, CAP_POSITION);
      else
        return start_capture(ms, s, p+1, CAP_UNFINISHED);
    }
    case ')': {  /* end capture */
      return end_capture(ms, s, p+1);
    }
    case '$': {
      if ((p+1) == ms->p_end)  /* is the `$' the last char in pattern? */
        return (s == ms->src_end) ? s : null;  /* check end of string */
      else break; //goto dflt;
    }
    case L_ESC: {  /* escaped sequences not in the format class[*+?-]? */
      switch (*(p+1)) {
        case 'b': case 'B': { /* balanced string? */ /* EXT */
          s = matchbalance(ms, s, p+2);
          if (s == null) return null;
          /* skip "%bxy" (4 chars) or "%Bxey" (5 chars) */
          p += (*(p + 1) == 'b') ? 4 : 5; /* EXT */ continue; // goto init;  /* else return match(ms, s, p+4); */
        }
        case 'f': {  /* frontier? */
          const char *ep; char previous;
          p += 2;
          if (*p != '['){
            ms->error = "missing '[' after '%%f' in pattern";
            return null;
          }
          if(!classend(ms, p, &ep)) return null;  /* points to what is next */
          previous = (s == ms->src_init) ? '\0' : *(s-1);
          if (matchbracketclass(uchar(previous), p, ep-1) ||
             !matchbracketclass(uchar(*s), p, ep-1)) return null;
          p=ep; continue; //goto init;  /* else return match(ms, s, ep); */
        }
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case '8': case '9': {  /* capture results (%0-%9)? */
          s = match_capture(ms, s, uchar(*(p+1)));
          if (s == null) return null;
          p+=2; continue; //goto init;  /* else return match(ms, s, p+2) */
        }
        //default: goto dflt;
      }
      /* any other escape falls out of both switches into the default handling */
    }
  }
    //default: dflt:
    {  /* pattern class plus optional suffix */
      const char *ep;
      int m;
      if(!classend(ms, p, &ep)) return null;  /* points to what is next */
      m = s < ms->src_end && singlematch(uchar(*s), p, ep);
      switch (*ep) {
        case '?': {  /* optional */
          const char *res;
          if (m && ((res=match(ms, s+1, ep+1)) != null))
            return res;
          p=ep+1; continue; //goto init;  /* else return match(ms, s, ep+1); */
        }
        case '*': {  /* 0 or more repetitions */
          return max_expand(ms, s, p, ep);
        }
        case '+': {  /* 1 or more repetitions */
          return (m ? max_expand(ms, s+1, p, ep) : null);
        }
        case '-': {  /* 0 or more repetitions (minimum) */
          return min_expand(ms, s, p, ep);
        }
        default: {
          if (!m) return null;
          s++; p=ep; continue; //goto init;  /* else return match(ms, s+1, ep); */
        }
      }
    }
  break;
  }
}

/*
** Plain (non-pattern) search for the l2 bytes at s2 inside the l1 bytes
** at s1. Lengths are explicit, so embedded '\0' bytes are allowed.
** Returns the address of the first occurrence, s1 itself for an empty
** needle, or null when there is none.
*/
static const char *lmemfind (const char *s1, size_t l1,
                               const char *s2, size_t l2) {
  if (l2 == 0) return s1;  /* empty strings are everywhere */
  if (l2 > l1) return null;  /* avoids a negative window below */

  size_t tail_len = l2 - 1;        /* 1st char is checked by `memchr' */
  size_t window = l1 - tail_len;   /* a match cannot start after that */
  const char *cursor = s1;
  while (window > 0) {
    const char *hit = (const char *)memchr(cursor, *s2, window);
    if (hit == null)
      break;
    if (memcmp(hit + 1, s2+1, tail_len) == 0)
      return hit;
    /* resume right after the rejected candidate */
    window -= (hit + 1) - cursor;
    cursor = hit + 1;
  }
  return null;  /* not found */
}

/* check whether pattern has no special characters */
/*
** Returns 1 when the l-byte pattern p contains none of the SPECIALS
** characters, 0 otherwise. The pattern is scanned one '\0'-terminated
** segment at a time so that text after embedded '\0' bytes is covered.
*/
static int nospecials (const char *p, size_t l) {
  size_t offset = 0;
  for (;;) {
    const char *segment = p + offset;
    if (strpbrk(segment, SPECIALS))
      return 0;  /* pattern has a special character */
    offset += strlen(segment) + 1;  /* may have more after \0 */
    if (offset > l)
      break;
  }
  return 1;  /* no special chars found */
}

/*
** Driver for find/match: searches subject s (length ls, or strlen(s)
** when ls < 0) for pattern p (length lp, or strlen(p) when lp < 0),
** starting at offset init (negative values count from the end, see
** posrelat).
**
**   find      non-zero allows the plain-search shortcut
**   raw_find  non-zero forces a plain search even if p has specials
**   fp/udata  optional callback invoked after each successful match; a
**             non-zero return makes the search repeat past the match
**
** On a match ms->start_pos / ms->end_pos hold the match offsets in s.
** Returns ms->start_pos when the match end offset is positive,
** otherwise the raw result (0, or -1 for "not found"). Returns 0 when
** init lies beyond the subject or when a capture was left unfinished
** (ms->error is set in the latter case).
*/
static ptrdiff_t str_find_aux (LuaMatchState *ms, int find, const char *s, ptrdiff_t ls,
                         const char *p, ptrdiff_t lp, ptrdiff_t init, int raw_find,
                         luaregex_func_param *fp, void *udata) {
  ptrdiff_t result;
  ms->error = null;
  if(ls < 0) ls = strlen(s);
  assert(ls >= 0);
  if(lp < 0) lp = strlen(p);
  assert(lp >= 0);
  init = posrelat(init, ls);
  if (init < 0) init = 0;
  /* NOTE(review): this lets init == ls + 1 through, so s + init below points
     past src_end and ls - init is negative -- looks like a leftover of
     1-based indexing; confirm whether the bound should be `init > ls`. */
  else if (init > ls + 1) {  /* start after string's end? */
    /* NOTE(review): 0 is also a valid match offset, whereas "not found"
       below is -1 -- confirm which value callers expect here. */
    return 0; /* cannot find anything */
  }
  ms->src_init = s;
  ms->src_end = s + ls;

//do_again:
for(;;) {
  result = -1; /* not found */
  /* explicit request or no special characters? */
  if (find && (raw_find || nospecials(p, lp))) {
    /* do a plain search */
    const char *s2 = lmemfind(s + init, ls - init, p, lp);
    if (s2) {
      ms->start_pos = ((int)(s2 - s));
      result = ms->end_pos = ms->start_pos+lp;
      ms->level = 0;
    }
  }
  else {
    const char *s1 = s + init;
    int anchor = (*p == '^');
    /* NOTE(review): p and lp are modified here inside the retry loop, so
       after a `continue` the anchor test runs on the already advanced
       pattern -- an anchored pattern looks unanchored on later passes. */
    if (anchor) {
      p++; lp--;  /* skip anchor character */
    }
    ms->p_end = p + lp;
    /* try every start position from init on (only the first when anchored) */
    do {
      const char *res;
      ms->level = 0;
      if ((res=match(ms, s1, p)) != null) {
          ms->start_pos = s1-s;
          result = ms->end_pos = res-s;
          break; //goto eofunc;
      }
    } while (s1++ < ms->src_end && !anchor);
  }
//eofunc:

  if(result >= 0){
      if(!check_capture_all_closed(ms)) return 0;
      /* let the callback see the match; non-zero asks for the next one */
      if(fp && fp(ms, udata, null)) {
          init = result;
          if (init == ms->start_pos) ++init;  /* empty match? go at least one position */
          if (init < ls) continue; //goto do_again;
      }
  }

  break;
}

  return result > 0 ? ms->start_pos : result; //returning the start position
}

/*
** Smoke test: exercises posrelat, match_class, lmemfind and a raw
** (plain) search through str_find_aux, printing each result.
**
** posrelat() and str_find_aux() return pointer-sized integers; they are
** cast to int so that the argument matches the "%d" conversion.
*/
int main ()
{
    LuaMatchState ms;

    printf ("lua-regex!\n");
    printf("%d\n", (int) posrelat(-10, 12));

    printf("match_class : %d\n", match_class('f', 'x'));

    char const* p1 = " foo bar 100 baz";
    const char * p2 = "baz";
    const char *found = lmemfind(p1, strlen(p1), p2, strlen(p2));

    printf("found : %s\n", found);

    /* find = 1, init = 0, raw_find = 1, no callback */
    ptrdiff_t dt = str_find_aux(&ms, 1, p1, strlen(p1), p2, strlen(p2), 0, 1, null, null);
    printf("found : %d\n", (int) dt);

    return 0;
}
mingodad commented 3 years ago

And here is my initial implementation of isalpha, ...:

diff --git a/src/jnc_ext/jnc_std/jnc/std_globals.jnc b/src/jnc_ext/jnc_std/jnc/std_globals.jnc
index 951cc88a..079a7e3c 100644
--- a/src/jnc_ext/jnc_std/jnc/std_globals.jnc
+++ b/src/jnc_ext/jnc_std/jnc/std_globals.jnc
@@ -442,6 +442,17 @@ intptr_t cdecl printf(
    ...
    );

+bool isalpha(uint32_t c);
+bool iscntrl(uint32_t c);
+bool isdigit(uint32_t c);
+bool isgraph(uint32_t c);
+bool islower(uint32_t c);
+bool ispunct(uint32_t c);
+bool isspace(uint32_t c);
+bool isupper(uint32_t c);
+bool isalnum(uint32_t c);
+bool isxdigit(uint32_t c);
+
 //! @}

 namespace std {
diff --git a/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp b/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp
index 92bdfa16..003a871f 100644
--- a/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp
+++ b/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp
@@ -148,6 +148,56 @@ strtoul(
    return strtot<uint64_t>(::_strtoui64, ptr, endPtr, radix);
 }

+bool isAlpha(uint32_t c)
+{
+    return enc::utfIsLetter(c);
+}
+
+bool isCntrl(uint32_t c)
+{
+    return iscntrl(c);
+}
+
+bool isDigit(uint32_t c)
+{
+    return enc::utfIsDigit(c);
+}
+
+bool isGraph(uint32_t c)
+{
+    return isgraph(c);
+}
+
+bool isLower(uint32_t c)
+{
+    return enc::utfIsLowerCase(c);
+}
+
+bool isPunct(uint32_t c)
+{
+    return enc::utfIsPunctuation(c);
+}
+
+bool isSpace(uint32_t c)
+{
+    return enc::utfIsSpace(c);
+}
+
+bool isUpper(uint32_t c)
+{
+    return enc::utfIsUpperCase(c);
+}
+
+bool isAlnum(uint32_t c)
+{
+    return enc::utfIsLetterOrDigit(c);
+}
+
+bool isXdigit(uint32_t c)
+{
+    return isxdigit(c);
+}
+
 uint32_t
 toUpper(uint32_t c)
 {
@@ -679,6 +729,17 @@ JNC_BEGIN_LIB_FUNCTION_MAP(jnc_StdLib)
    JNC_MAP_OVERLOAD(setError_1)
    JNC_MAP_FUNCTION("std.format",       format)

+   JNC_MAP_FUNCTION("isalpha",   isAlpha)
+   JNC_MAP_FUNCTION("iscntrl",   isCntrl)
+   JNC_MAP_FUNCTION("isdigit",   isDigit)
+   JNC_MAP_FUNCTION("isgraph",   isGraph)
+   JNC_MAP_FUNCTION("islower",   isLower)
+   JNC_MAP_FUNCTION("ispunct",   isPunct)
+   JNC_MAP_FUNCTION("isspace",   isSpace)
+   JNC_MAP_FUNCTION("isupper",   isUpper)
+   JNC_MAP_FUNCTION("isalnum",   isAlnum)
+   JNC_MAP_FUNCTION("isxdigit",  isXdigit)
+
    JNC_MAP_FUNCTION("strlen",   jnc::strLen)
    JNC_MAP_FUNCTION("strcmp",   strCmp)
    JNC_MAP_FUNCTION("strncmp",  strnCmp)
vovkos commented 3 years ago

My lua-regex.jnc so far (to show what I'm trying to achieve) that has this error:

jancy "lua-regex.jnc"
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(303,50): binary '+' cannot be applied to 'char [9]' and 'char*'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(182,48): unexpected 'identifier' in 'literal'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): undeclared identifier 'INT_MAX'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): unable to recover from previous error(s)
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(42,1172): binary '+' cannot be applied to 'char [2]' and 'char const*'
5 error(s); compilation failed

You are trying to add char pointers/char arrays, and that doesn't work -- just like in C.

For building strings please use:

vovkos commented 3 years ago

And here is my initial implementation of isalpha, ...:

A PR with those standard C runtime functions would be very welcome.