cwbaker / lalr

LALR(1) parser for C++
MIT License
78 stars 11 forks source link

Unable to accept the literal '\\' #40

Open mingodad opened 1 year ago

mingodad commented 1 year ago

While testing a grammar for https://github.com/potassco/clingo/blob/master/libgringo/src/input/nongroundgrammar.yy I found that lalr does not handle the literal '\\'; see the example and a possible fix below:

escaped {

%whitespace "[ \t\r\n]*";

    ch : '\\' ;
}

Error:

lalr (5:0): ERROR: unterminated literal
lalr (5:0): ERROR: expected ';' not found
lalr (5:0): ERROR: expected '}' not found
Error compiling grammar. Error count = 3

Possible fix:

/// Match a single-quoted literal at the current parse position.
///
/// Skips leading whitespace and comments. If the next character is a single
/// quote, scans forward until an unescaped closing quote, a new line, or the
/// end of input. When the scan stops at a quote or at the end of input the
/// text between the quotes is stored in `lexeme_` exactly as written (escape
/// sequences are not decoded here), `position_` is advanced past it, and
/// `expect()` is called for the closing quote.
///
/// @return true if the scan did not stop on a new line; false if no literal
///  starts here, or if a new line was hit first (reported as an
///  unterminated literal).
bool GrammarParser::match_literal()
{
    match_whitespace_and_comments();
    if ( match("'") )
    {
        bool escaped = false;
        const char* position = position_;
        while ( position != end_ && (*position != '\'' || escaped) && !is_new_line(position) )
        {
            // A backslash only starts an escape when it is not itself escaped,
            // so in '\\' the second backslash clears the flag and the quote
            // that follows terminates the literal.  Tracking this in the flag
            // (rather than peeking at the next character) means `position` is
            // never dereferenced without first being checked against `end_`.
            escaped = !escaped && *position == '\\';
            ++position;
        }
        if ( position == end_ || !is_new_line(position) )
        {
            lexeme_.assign( position_, position );
            position_ = position;
            expect( "'" );
            return true;
        }
        error( line_, LALR_ERROR_UNTERMINATED_LITERAL, "unterminated literal" );
        return false;
    }
    return false;
}
mingodad commented 1 year ago

The same happens in GrammarParser::match_regex.

mingodad commented 1 year ago

Also, an empty literal/regex should be reported as an error instead of terminating the program via an assert; see the possible fix below.

/// Error codes reported while compiling grammars, generating lexers and
/// parsers, and processing input.
///
/// NOTE(review): the enumerators have no explicit values, so they are numbered
/// sequentially from zero in declaration order; adding an enumerator in the
/// middle of the list renumbers every enumerator after it -- confirm nothing
/// depends on the numeric values.
enum ErrorCode
{
    PARSER_ERROR_NONE, ///< No error.
    LALR_ERROR_SYNTAX, ///< Syntax error occurred while parsing input.
    LALR_ERROR_UNTERMINATED_LITERAL, ///< Unterminated literal in an lalr grammar.
    LALR_ERROR_EMPTY_LITERAL, ///< Empty literal in an lalr grammar.
    LEXER_ERROR_MISSING_ACTION_HANDLER, ///< A lexer action hasn't been bound to a function.
    LEXER_ERROR_SYNTAX, ///< Syntax error occurred while parsing some input.
    LEXER_ERROR_SYMBOL_CONFLICT, ///< A lexer state matches more than one symbol.
    LEXER_ERROR_LEXICAL_ERROR, ///< A lexical error occurred while scanning an input sequence.
    PARSER_ERROR_OPENING_FILE_FAILED, ///< Opening a grammar file failed.
    PARSER_ERROR_PARSING_FAILED, ///< Parsing a grammar failed.
    PARSER_ERROR_UNEXPECTED, ///< An unexpected error occurred.
    PARSER_ERROR_SYNTAX, ///< Syntax error occurred while parsing some input.
    PARSER_ERROR_PARSE_TABLE_CONFLICT, ///< A shift-reduce or reduce-reduce conflict was found in the parse table.
    PARSER_ERROR_UNDEFINED_SYMBOL, ///< A grammar symbol is referenced but not defined.
    PARSER_ERROR_UNREFERENCED_SYMBOL, ///< A grammar symbol is defined but not referenced.
    PARSER_ERROR_ERROR_SYMBOL_ON_LEFT_HAND_SIDE, ///< The 'error' symbol has been used on the left hand side of a production.
    PARSER_ERROR_DUPLICATE_ASSOCIATION_ON_IMPLICIT_TERMINAL ///< Both implicit terminal forms specify associativity and precedence.
};
/// Match a single-quoted literal at the current parse position.
///
/// Skips leading whitespace and comments. If the next character is a single
/// quote, scans forward until an unescaped closing quote, a new line, or the
/// end of input. When the scan stops at a quote or at the end of input the
/// text between the quotes is stored in `lexeme_` exactly as written (escape
/// sequences are not decoded here), `position_` is advanced past it, and
/// `expect()` is called for the closing quote.
///
/// @return true if a non-empty literal was scanned; false if no literal starts
///  here, if the literal is empty (reported as LALR_ERROR_EMPTY_LITERAL), or
///  if a new line was hit before the closing quote (reported as
///  LALR_ERROR_UNTERMINATED_LITERAL).
bool GrammarParser::match_literal()
{
    match_whitespace_and_comments();
    if ( match("'") )
    {
        bool escaped = false;
        const char* position = position_;
        while ( position != end_ && (*position != '\'' || escaped) && !is_new_line(position) )
        {
            // A backslash only starts an escape when it is not itself escaped,
            // so in '\\' the second backslash clears the flag and the quote
            // that follows terminates the literal.  Tracking this in the flag
            // (rather than peeking at the next character) means `position` is
            // never dereferenced without first being checked against `end_`.
            escaped = !escaped && *position == '\\';
            ++position;
        }
        if ( position == end_ || !is_new_line(position) )
        {
            lexeme_.assign( position_, position );
            position_ = position;
            expect( "'" );
            // Report '' as a grammar error here rather than letting an empty
            // lexeme reach later stages.
            if ( lexeme_.empty() )
            {
                error( line_, LALR_ERROR_EMPTY_LITERAL, "empty literal" );
                return false;
            }
            return true;
        }
        error( line_, LALR_ERROR_UNTERMINATED_LITERAL, "unterminated literal" );
        return false;
    }
    return false;
}

/// Match a double-quoted regular expression at the current parse position.
///
/// Skips leading whitespace and comments. If the next character is a double
/// quote, scans forward until an unescaped closing double quote or the end of
/// input (new lines do not stop the scan). The text between the quotes is
/// stored in `lexeme_` exactly as written, `position_` is advanced past it,
/// and `expect()` is called for the closing quote.
///
/// @return true if a non-empty regex was scanned; false if no regex starts
///  here or if the regex is empty (reported as LALR_ERROR_EMPTY_LITERAL).
bool GrammarParser::match_regex()
{
    match_whitespace_and_comments();
    if ( match("\"") )
    {
        bool escaped = false;
        const char* position = position_;
        while ( position != end_ && (*position != '"' || escaped) )
        {
            // A backslash only starts an escape when it is not itself escaped,
            // so in "\\" the second backslash clears the flag and the quote
            // that follows terminates the regex.  Tracking this in the flag
            // (rather than peeking at the next character) means `position` is
            // never dereferenced without first being checked against `end_`.
            escaped = !escaped && *position == '\\';
            ++position;
        }
        lexeme_.assign( position_, position );
        position_ = position;
        expect( "\"" );
        // Report "" as a grammar error here rather than letting an empty
        // lexeme reach later stages.
        if ( lexeme_.empty() )
        {
            error( line_, LALR_ERROR_EMPTY_LITERAL, "empty regex" );
            return false;
        }
        return true;
    }
    return false;
}