BenHanson / parsertl14

C++14 version of parsertl
32 stars 4 forks source link

Help needed to implement dump parse tree #8

Closed mingodad closed 1 year ago

mingodad commented 1 year ago

I'm trying to implement a generic dump parse tree but I'm having trouble to understand the details of the parser/state machine.

And I think that I found a bug because there is only 9 rules (0-8) but there is an entry in parsertl::state_machine with index 9 that is one passing the end shift 9:0 <- $ => *.

Any help is welcome !

#include <parsertl/debug.hpp>
//#include <lexertl/debug.hpp>
#include <parsertl/generator.hpp>
#include <iostream>
#include <parsertl/lookup.hpp>

struct ParseTreeUserData {
    std::vector<ParseTreeUserData> children;
    int symbol_id;
    std::string value; ///< The value at this node (empty if this node's symbol is non-terminal).
    ParseTreeUserData():children(0),symbol_id(-1) {};
    ParseTreeUserData(int id):children(0),symbol_id(id) {};
    ParseTreeUserData(int id, const char*v):children(0),symbol_id(id), value(v) {};
};

static void parsetree_indent( int level )
{
    for ( int i = 0; i < level; ++i )
    {
        printf( " |" );
    }
}

static void print_parsetree( const ParseTreeUserData& ast, parsertl::rules::string_vector& symbols, int level )
{
    if(ast.symbol_id >= 0)
    {
        parsetree_indent( level );
        if(!ast.value.empty()) //it's a terminal
        {
            printf("%s -> %s\n", symbols[ast.symbol_id].c_str(), ast.value.c_str());
        }
        else
        {
            printf("%s\n", symbols[ast.symbol_id].c_str());
        }
    }

    for (const auto& child : ast.children)
    {
        print_parsetree( child, symbols, ast.symbol_id >= 0 ? (level + 1) : level );
    }
}

static void dump_parse_tree(parsertl::match_results& results, lexertl::citerator& iter_lex,
        parsertl::state_machine& gsm, parsertl::rules::string_vector& symbols)
{
    std::cout << "Rules size = " << gsm._rules.size() << "\n";
    std::cout << "Symbols size = " << symbols.size() << "\n";
    for(const auto& symbol : symbols) { std::cout << symbol << "\n";}
    std::stack<ParseTreeUserData> syn_tree;
    bool parse_done=false;
    for(;;)
    {
        const auto& sm_entry = results.entry;
        const auto& tok_start = iter_lex->first;
        const auto& tok_end = iter_lex->second;
        // Debug info
        switch (sm_entry.action)
        {
            case parsertl::action::error:
                throw std::runtime_error("Parser error");
                break;
            case parsertl::action::shift:
            {
                const std::string str(tok_start, tok_end);
                const auto& [lhs_id, rhs_id_vec] = gsm._rules[sm_entry.param];
                std::cout << "shift " << sm_entry.param
            << ":" << lhs_id
                        << " <- " << symbols[lhs_id];
                if(tok_start != tok_end)
                {
                    const std::string str(tok_start, tok_end);
                    std::cout << " => " << str;
                }
                std::cout << '\n';
                auto& ud = syn_tree.emplace(lhs_id);
                ud.value = str;
                break;
            }
            case parsertl::action::reduce:
            {
                const auto& [lhs_id, rhs_id_vec] = gsm._rules[sm_entry.param];
                ParseTreeUserData ud(lhs_id);

                std::cout << "reduce "<< rhs_id_vec.size() << " elements by: " << symbols[lhs_id] << " ->";

                if (rhs_id_vec.empty())
                {
                    std::cout << " %empty";
                }
                else
                {
                    for (auto rhs_id : rhs_id_vec)
                    {
                        std::cout << ' ' << symbols[rhs_id];
                        ParseTreeUserData& child = ud.children.emplace_back(syn_tree.top());
                        child.symbol_id = rhs_id;
                        syn_tree.pop();
                    }
                    if(tok_start != tok_end)
                    {
                        const std::string str(tok_start, tok_end);
                        std::cout << " => " << str;
                    }
                }
                syn_tree.emplace(ud);
                std::cout << '\n';
                break;
            }
            case parsertl::action::go_to:
                std::cout << "goto " << sm_entry.param << '\n';
                break;
            case parsertl::action::accept:
                for(;;) //do a hack cleanup
                {
                    ParseTreeUserData& ud = syn_tree.top();
                    if(ud.children.size() == 0 && syn_tree.size() > 1)
                    {
                        syn_tree.pop();
                        continue;
                    }
                    break;
                }
                std::cout << "accept\n";
                parse_done = true;
                break;
        }
        if(parse_done) break;
        parsertl::lookup(iter_lex, gsm, results);
    }

    if (!syn_tree.empty())
    {
        print_parsetree(syn_tree.top(), symbols, 0);
    }
}

int main(int argc, char *argv[])
{
    parsertl::rules grules;
    parsertl::state_machine gsm;
    parsertl::rules::string_vector symbols;
    lexertl::rules lrules;
    lexertl::state_machine lsm;

    try
    {
        // Calculator from flex & bison book.
        grules.token("INTEGER");
        grules.left("'+' '-'");
        grules.left("'*' '/'");
        grules.precedence("UMINUS");

        grules.push("start", "exp");
        grules.push("exp", "exp '+' exp");
        grules.push("exp", "exp '-' exp");
        grules.push("exp", "exp '*' exp");
        grules.push("exp", "exp '/' exp");
        grules.push("exp", "'(' exp ')'");
        grules.push("exp", "'-' exp %prec UMINUS");
        grules.push("exp", "INTEGER");

        parsertl::debug::dump(grules, std::cout);
        parsertl::generator::build(grules, gsm);
        grules.terminals(symbols);
        grules.non_terminals(symbols);

        lrules.insert_macro("c_comment", "[/]{2}.*|[/][*](?s:.)*?[*][/]");
        lrules.insert_macro("escape", "\\\\(.|x[0-9A-Fa-f]+|c[@a-zA-Z])");

        lrules.push("[+]",  grules.token_id("'+'"));
        lrules.push("-",    grules.token_id("'-'"));
        lrules.push("[*]",  grules.token_id("'*'"));
        lrules.push("[/]",  grules.token_id("'/'"));
        lrules.push("\\d+", grules.token_id("INTEGER"));
        lrules.push("[(]",  grules.token_id("'('"));
        lrules.push("[)]",  grules.token_id("')'"));
        lrules.push("\\s+", lsm.skip());
        //lexertl::debug::dump(lrules, std::cout, false);
        lexertl::generator::build(lrules, lsm);

        std::string expr("1 + 2 * -3");
        lexertl::citerator iter_lex(expr.c_str(), expr.c_str() + expr.size(), lsm);
        parsertl::match_results results(iter_lex->id, gsm);
/*
    while(iter_lex->id != 0) {
        const std::size_t line = 1;
        const std::size_t column = 1;
        std::cout << line << ":" << column << ":" << iter_lex->id << ": " <<
            iter_lex->str() << "\n"; ++iter_lex;
    }
*/
        dump_parse_tree(results, iter_lex, gsm, symbols);
    }
    catch (const std::exception &e)
    {
        std::cout << "Exception: " << e.what() << '\n';
    }

    return 0;
}

Output:

%token INTEGER '(' ')'
%left '+' '-'
%left '*' '/'
%precedence UMINUS
%%

start: exp;

exp: exp '+' exp
   | exp '-' exp
   | exp '*' exp
   | exp '/' exp
   | '(' exp ')'
   | '-' exp %prec UMINUS
   | INTEGER;

%%
Rules size = 9
Symbols size = 12
$
INTEGER
'+'
'-'
'*'
'/'
UMINUS
'('
')'
start
exp
$accept
shift 5:10 <- exp => 1
reduce 1 elements by: exp -> INTEGER => +
goto 2
shift 7:10 <- exp => +
shift 5:10 <- exp => 2
reduce 1 elements by: exp -> INTEGER => *
goto 13
shift 9:0 <- $ => *
shift 4:10 <- exp => -
shift 5:10 <- exp => 3
reduce 1 elements by: exp -> INTEGER
goto 12
reduce 2 elements by: exp -> '-' exp
goto 15
reduce 3 elements by: exp -> exp '*' exp
goto 13
reduce 3 elements by: exp -> exp '+' exp
goto 2
reduce 1 elements by: start -> exp
goto 1
shift 6:10 <- exp
accept
start
 |exp
 | |exp
 | | |exp
 | | | |'-'
 | | | | |INTEGER -> 3
 | | | |exp -> -
 | | |'*' -> *
 | | |exp
 | | | |INTEGER -> 2
 | |'+' -> +
 | |exp
 | | |INTEGER -> 1

Valgrind output:

valgrind ./syntax-tree-nb 
==25862== Memcheck, a memory error detector
==25862== Copyright (C) 2002-2022, and GNU GPL'd, by Julian Seward et al.
==25862== Using Valgrind-3.21.0 and LibVEX; rerun with -h for copyright info
==25862== Command: ./syntax-tree-nb
==25862== 
%token INTEGER '(' ')'
%left '+' '-'
...
goto 13
==25862== Use of uninitialised value of size 8
==25862==    at 0x4F75763: ??? (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.32)
==25862==    by 0x4F75DC0: std::ostreambuf_iterator<char, std::char_traits<char> > std::num_put<char, std::ostreambuf_iterator<char, std::char_traits<char> > >::_M_insert_int<unsigned long>(std::ostreambuf_iterator<char, std::char_traits<char> >, std::ios_base&, char, unsigned long) const (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.32)
==25862==    by 0x4F83BE5: std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long) (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.32)
==25862==    by 0x112A45: dump_parse_tree(parsertl::basic_match_results<parsertl::basic_state_machine<unsigned short> >&, lexertl::iterator<char const*, lexertl::basic_state_machine<char, unsigned short>, lexertl::match_results<char const*, unsigned short, 95ul> >&, parsertl::basic_state_machine<unsigned short>&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >&) (syntax-tree.cpp:68)
==25862==    by 0x113C71: main (syntax-tree.cpp:193)
==25862== 
==25862== Conditional jump or move depends on uninitialised value(s)
==25862==    at 0x4F75775: ??? (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.32)
==25862==    by 0x4F75DC0: std::ostreambuf_iterator<char, std::char_traits<char> > std::num_put<char, std::ostreambuf_iterator<char, std::char_traits<char> > >::_M_insert_int<unsigned long>(std::ostreambuf_iterator<char, std::char_traits<char> >, std::ios_base&, char, unsigned long) const (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.32)
==25862==    by 0x4F83BE5: std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long) (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.32)
==25862==    by 0x112A45: dump_parse_tree(parsertl::basic_match_results<parsertl::basic_state_machine<unsigned short> >&, lexertl::iterator<char const*, lexertl::basic_state_machine<char, unsigned short>, lexertl::match_results<char const*, unsigned short, 95ul> >&, parsertl::basic_state_machine<unsigned short>&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >&) (syntax-tree.cpp:68)
==25862==    by 0x113C71: main (syntax-tree.cpp:193)
==25862== 
==25862== Use of uninitialised value of size 8
==25862==    at 0x4F95970: std::basic_ostream<char, std::char_traits<char> >& std::operator<< <char, std::char_traits<char>, std::allocator<char> >(std::basic_ostream<char, std::char_traits<char> >&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.32)
==25862==    by 0x112A81: dump_parse_tree(parsertl::basic_match_results<parsertl::basic_state_machine<unsigned short> >&, lexertl::iterator<char const*, lexertl::basic_state_machine<char, unsigned short>, lexertl::match_results<char const*, unsigned short, 95ul> >&, parsertl::basic_state_machine<unsigned short>&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >&) (syntax-tree.cpp:69)
==25862==    by 0x113C71: main (syntax-tree.cpp:193)
==25862== 
shift 9:0 <- $ => *
shift 4:10 <- exp => -
...
 | | |INTEGER -> 1
==25862== 
==25862== HEAP SUMMARY:
==25862==     in use at exit: 0 bytes in 0 blocks
==25862==   total heap usage: 11,136 allocs, 11,136 frees, 891,670 bytes allocated
==25862== 
==25862== All heap blocks were freed -- no leaks are possible
==25862== 
==25862== Use --track-origins=yes to see where uninitialised values come from
==25862== For lists of detected and suppressed errors, rerun with: -s
==25862== ERROR SUMMARY: 3 errors from 3 contexts (suppressed: 0 from 0)
mingodad commented 1 year ago

I'm basing the code shown above on Trace parser operation from http://benhanson.net/parsertl.html and expanded it to show the rule(lhs) and the value been shifted by using code derived from the case parsertl::action::reduce and maybe I'm missing something there.

BenHanson commented 1 year ago

The shift param of 9 is referring to the state machine state (gsm._table[9])

    case parsertl::action::shift:
    {
        const std::string str(tok_start, tok_end);

        std::cout << "shift " << sm_entry.param
            << ":" << iter_lex->id
            << " <- " << symbols[iter_lex->id];
        if (tok_start != tok_end)
        {
            const std::string str(tok_start, tok_end); 
            std::cout << " => " << str;
        }
        std::cout << '\n';
        auto& ud = syn_tree.emplace(iter_lex->id);
        ud.value = str;
        break;
    }

With that change it at least doesn't crash.

mingodad commented 1 year ago

Thank you ! But on your example Trace parser operation you use it.

It's an error there too ?

case parsertl::action::reduce:
                {
                    parsertl::state_machine::id_type_vector_pair &pair_ =
                        gsm_._rules[results_.entry.param];

                    reduce_(results_.reduce_id());
BenHanson commented 1 year ago

shift and reduce are not the same.

I recommend stepping through my example and understanding the use of all the containers and values.

mingodad commented 1 year ago

Thank you ! I'm on it.