Hejsil / mecha

A parser combinator library for Zig
MIT License
453 stars 20 forks source link

Infinite recursion - i.e. how to use ref()? #51

Open cdcarter opened 1 year ago

cdcarter commented 1 year ago

I'm trying to write a very simple arithmetic expression parser. I'm not even trying to have any specific operator precedence yet, just going left to right. I'm ending up in an infinite loop, and I suspect it's because I'm misunderstanding ref.

Here's the data structure we're trying to parse into.

const Expression = union(enum) {
    value: u16,
    binOp: BinOp,

    const BinOp = struct {
        lhs: *Expression,
        operator: Op,
        rhs: *Expression,
    };

    const Op = enum { @"+", @"-", @"*", @"/" };
};

and a conversion function ...

fn toExpression(allocator: std.mem.Allocator, resultType: anytype) !*Expression {
    var x = try allocator.create(Expression);
    x.* = switch (@TypeOf(resultType)) {
        Expression.BinOp => Expression{ .binOp = resultType },
        u16 => Expression{ .value = resultType },
        *Expression => resultType.*, // probbaly shouldn't allocate again here but we're being fine.
        else => std.debug.panic("Unexpected type to toExpression {}", .{ @typeName(@TypeOf(resultType)) }),
    };
    return x;
}

here's the parser definition(s)

const ws = mecha.oneOf(.{
    mecha.utf8.char(0x0020),
    mecha.utf8.char(0x000A),
    mecha.utf8.char(0x000D),
    mecha.utf8.char(0x0009),
}).many(.{ .collect = false }).discard();

const integer = mecha.combine(.{ mecha.int(u16, .{ .parse_sign = false }), ws });

// TODO make these utf codepoints and then convert them before toEnuming. maybe fix up the toEnummer to not freak out on singel characters.
const plus = mecha.string("+");
const minus = mecha.string("-");
const star = mecha.string("*");
const slash = mecha.string("/");

const operator = mecha.oneOf(.{ plus, minus, star, slash }).convert(mecha.toEnum(Expression.Op));
const binOp = mecha.combine(.{
    mecha.ref(expressionRef),
    operator,
    mecha.ref(expressionRef),
}).map(mecha.toStruct(Expression.BinOp)).convert(toExpression);

fn expressionRef() mecha.Parser(*Expression) {
    return expression;
}

const expression = mecha.oneOf(.{
    binOp,
    integer.convert(toExpression),
});

A simple usage end in an infinite recursion

test "expr" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    const a = arena.allocator();
    const b = try expression.parse(a, "200 + 100");
    _ = b;
    // std.debug.print("\n {s}, {}", .{ b.rest, b.value });

    arena.deinit();
}
(lldb) bt all
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=2, address=0x16f603fe0)
  * frame #0: 0x00000001000062f4 test`mecha.map__struct_4549.parse(allocator=<unavailable>, str=(ptr = <read memory from 0x16f603f28 failed (0 of 8 bytes read)>, len = <read memory from 0x16f603f30 failed (0 of 8 bytes read)>)) at mecha.zig:602
    frame #1: 0x0000000100006738 test`mecha.convert__struct_4556.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:487:39
    frame #2: 0x000000010000096c test`mecha.oneOf__struct_5182.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:406:28
    frame #3: 0x0000000100004f24 test`mecha.ref__struct_3703.parse(allocator=mem.Allocator @ 0x000000016fdfee48, str=(ptr = "200 + 100", len = 9)) at mecha.zig:890:32
    frame #4: 0x0000000100006090 test`mecha.combine__struct_4474.parse(allocator=mem.Allocator @ 0x000000016fdfee48, str=(ptr = "200 + 100", len = 9)) at mecha.zig:354:43
    frame #5: 0x0000000100006328 test`mecha.map__struct_4549.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:603:39
    frame #6: 0x0000000100006738 test`mecha.convert__struct_4556.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:487:39
    frame #7: 0x000000010000096c test`mecha.oneOf__struct_5182.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:406:28
    frame #8: 0x0000000100004f24 test`mecha.ref__struct_3703.parse(allocator=mem.Allocator @ 0x000000016fdfee48, str=(ptr = "200 + 100", len = 9)) at mecha.zig:890:32
    frame #9: 0x0000000100006090 test`mecha.combine__struct_4474.parse(allocator=mem.Allocator @ 0x000000016fdfee48, str=(ptr = "200 + 100", len = 9)) at mecha.zig:354:43
    frame #10: 0x0000000100006328 test`mecha.map__struct_4549.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:603:39
    frame #11: 0x0000000100006738 test`mecha.convert__struct_4556.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:487:39
    frame #12: 0x000000010000096c test`mecha.oneOf__struct_5182.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:406:28
    frame #13: 0x0000000100004f24 test`mecha.ref__struct_3703.parse(allocator=mem.Allocator @ 0x000000016fdfee48, str=(ptr = "200 + 100", len = 9)) at mecha.zig:890:32
    frame #14: 0x0000000100006090 test`mecha.combine__struct_4474.parse(allocator=mem.Allocator @ 0x000000016fdfee48, str=(ptr = "200 + 100", len = 9)) at mecha.zig:354:43
    frame #15: 0x0000000100006328 test`mecha.map__struct_4549.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:603:39
    frame #16: 0x0000000100006738 test`mecha.convert__struct_4556.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:487:39
    frame #17: 0x000000010000096c test`mecha.oneOf__struct_5182.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:406:28
    frame #18: 0x0000000100004f24 test`mecha.ref__struct_3703.parse(allocator=mem.Allocator @ 0x000000016fdfee48, str=(ptr = "200 + 100", len = 9)) at mecha.zig:890:32
    frame #19: 0x0000000100006090 test`mecha.combine__struct_4474.parse(allocator=mem.Allocator @ 0x000000016fdfee48, str=(ptr = "200 + 100", len = 9)) at mecha.zig:354:43
    frame #20: 0x0000000100006328 test`mecha.map__struct_4549.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:603:39
    frame #21: 0x0000000100006738 test`mecha.convert__struct_4556.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:487:39
    frame #22: 0x000000010000096c test`mecha.oneOf__struct_5182.parse(allocator=<unavailable>, str=(ptr = "200 + 100", len = 9)) at mecha.zig:406:28

and so on, for many tens of thousands of frames...

I'm simply unsure what to do next to troubleshoot. any advice appreciated.

Hejsil commented 1 year ago

Yea, this is a quite common problem when writing parsers called Left Recursion. You basically have the rule:

expr = binOp
binOp = expr + expr

So what happens when you try parsing 1 + 1?

Well:

expr -> binOp -> expr + expr
                  |
                  +> expr -> binOp -> expr + expr
                                       |
                                       +> expr -> binOp -> expr + expr
                                                             |
                                                             + ...

To solve this, you could change the rules to be something like this:

expr = binOp
binOp = ( number + )* number // A list of `number +` followed by a `number`

This can be written in mecha with the mecha.many function, but the problem is when you need to have multiple operators. I can think of a few none elegant solution in my head right now, but I would have to play around to see what would be the best way to handle that.

On a side note, I'm pretty sure other parser combinator libraries have functions for this specific pattern and mecha probably should have that too.

cdcarter commented 1 year ago

Well when you point it out, it seems obvious!

I've got something that's pretty much working out of

const integer = mecha.combine(.{ mecha.int(u16, .{ .parse_sign = false }), ws });
const plus = mecha.string("+");
const minus = mecha.string("-");
const star = mecha.string("*");
const slash = mecha.string("/");
const operator = mecha.oneOf(.{ plus, minus, star, slash }).convert(mecha.toEnum(Expression.Op));

const base_expression = mecha.oneOf(.{integer.convert(toExpression)}); // will eventually have identifier too
const binOp = mecha.combine(.{
    base_expression,
    operator,
    mecha.ref(expressionRef),
}).map(mecha.toStruct(Expression.BinOp)).convert(toExpression);

fn expressionRef() mecha.Parser(*Expression) {
    return expression;
}

const expression = mecha.oneOf(.{ binOp, base_expression });

On a side note, I'm pretty sure other parser combinator libraries have functions for this specific pattern and mecha probably should have that too.

Indeed. I am pretty new to combinator/PEG style parsing but I notice that e.g. pyparsing has a helper called infix_notation that directly builds up an operator precedence table. That's the sort of thing I'd like to evolve this into, over time.

I also got a version of this grammar working, stolen from wikipedia

// Expr    ← Sum
// Sum     ← Product (('+' / '-') Product)*
// Product ← Power (('*' / '/') Power)*
// Power   ← Value ('^' Power)?
// Value   ← [0-9]+ / '(' Expr ')'

I haven't figured out how to convert this one into an AST (... yet!, I'll get there!) instead of just discarding, but it works quite well.