Closed egil closed 2 years ago
My current attempt looks like this:
using Pidgin;
using Scim.ScimFilters;
using static Pidgin.Parser;
using static Pidgin.Parser<char>;
namespace Scim.Pidgin;
public class ScimFilterParser
{
/// <summary>
/// A parser that parses and returns a single alpha character case insensitive.
/// </summary>
/// <remarks><code>
/// A-Z / a-z
/// </remarks>
private static Parser<char, char> Alpha { get; }
= CIOneOf('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z');
/// <summary>
/// A parser that parses and returns a single alpha numerical character case insensitive.
/// </summary>
/// <remarks><code>
/// A-Z / a-z / 0-9
/// </code></remarks>
private static Parser<char, char> AlphaNum { get; }
= OneOf(Alpha, Digit);
/// <remarks><code>
/// DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
/// </code></remarks>
private static Parser<char, char> HexDig { get; }
= OneOf(Digit).Or(OneOf('A', 'B', 'C', 'D', 'E', 'F'));
/// <remarks><code>
/// pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
/// pct-encoded = "%" HEXDIG HEXDIG
/// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
/// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
/// </code></remarks>
private static Parser<char, char> PChar { get; }
= OneOf(
AlphaNum, // unreserved
OneOf('-', '.', '_', '~', // unreserved
'!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', // sub-delims
':', '@'),
Map((_, a, b) => (char)Convert.ToUInt32($"{a}{b}", fromBase: 16), Char('%'), HexDig, HexDig)); // pct-encoded
/// <summary>
/// A URN parser. Note, only supports <c>assigned-name</c> form.
/// </summary>
/// <remarks><code>
/// assigned-name = "urn" ":" NID ":" NSS
/// NID = (alphanum) 0*30(ldh) (alphanum)
/// ldh = alphanum / "-"
/// NSS = pchar *(pchar / "/")
/// </code></remarks>
public static Parser<char, string> Urn { get; }
= CreateUrn().Labelled("URN");
public static Parser<char, string> CreateUrn()
{
var start = CIString("urn").Before(Char(':'));
var nid = Map(
static (start, rest) => start + rest,
AlphaNum,
OneOf(AlphaNum, Char('-'))
.ManyString()
.Where(x => x.Length < 31 && x.Length > 0 && x[^1] != '-')
).Before(Char(':'));
var nss = Map(
static (start, rest) => start + rest,
PChar,
OneOf(PChar, Char('/')).ManyString());
return Map(
static (start, nid, nss) => $"{start}:{nid}:{nss}",
start,
nid,
nss);
}
/// <summary>
/// A nameChar parser, that parses a single character.
/// </summary>
/// <remarks><code>
/// "-" / "_" / DIGIT / ALPHA
/// </code></remarks>
public static Parser<char, char> NameChar { get; }
= OneOf(AlphaNum, Char('_'), Char('-'))
.Labelled("name character");
/// <summary>
/// An attribute name (ATTRNAME) parser.
/// </summary>
/// <remarks><code>
/// ALPHA *(nameChar)
/// </code></remarks>
public static Parser<char, string> AttrName { get; }
= Map(
static (first, rest) => first + rest,
Alpha,
NameChar.ManyString())
.Labelled("attribute name");
/// <summary>
/// A sub-attribute of a complex attribute (subAttr) parser.
/// </summary>
/// <remarks><code>
/// "." ATTRNAME
/// </code></remarks>
public static Parser<char, string> SubAttr { get; }
= Map(
static (period, subAttr) => period + subAttr,
Char('.'),
AttrName)
.Labelled("sub attribute");
private static Parser<char, string> AttrPathWithoutUrn
= Map(static (attr, subAttrs) => attr + subAttrs,
AttrName,
SubAttr.ManyString());
/// <remarks><code>
/// attrPath = [URI ":"] ATTRNAME *1subAttr
/// ; SCIM attribute name
/// ; URI is SCIM "schema" URI
/// </code></remarks>
public static Parser<char, AttrPath> AttrPath { get; }
= Try(
// Because the NSS part of URN also includes
// AttrPathWithoutUrn, it will consume the
// `ATTRNAME *1subAttr` part of attribute path.
// The MapWithInput gets the candidate attr path
// from the URN parse result and validates it using
// the AttrPathWithoutUrn parser.
Urn.MapWithInput<AttrPath?>((urn, _) =>
{
var index = urn.LastIndexOf(':');
var attrPath = urn[(index + 1)..];
return AttrPathWithoutUrn.Parse(attrPath).Success
? new AttrPath(urn[..index].ToString(), attrPath.ToString())
: null;
})
.Assert(x => x.HasValue)
.Select(x => x!.Value))
.Or(AttrPathWithoutUrn.Select(x => new AttrPath(x)))
.Labelled("attribute path");
/// <remarks><code>
/// compareOp = "eq" / "ne" / "co" /
/// "sw" / "ew" /
/// "gt" / "lt" /
/// "ge" / "le"
/// </code></remarks>
public static Parser<char, Operation> CompareOp { get; }
= OneOf(
// eq or ew
CIChar('e').Then(
CIChar('q').ThenReturn(Operation.Equal)
.Or(CIChar('w').ThenReturn(Operation.EndsWith))
),
CIString("ne").Select(_ => Operation.NotEqual),
// gt or ge
CIChar('g').Then(
CIChar('t').ThenReturn(Operation.GreaterThan)
.Or(CIChar('e').ThenReturn(Operation.GreaterThanOrEqual))
),
// lt or le
CIChar('l').Then(
CIChar('t').ThenReturn(Operation.LessThan)
.Or(CIChar('e').ThenReturn(Operation.LessThanOrEqual))
),
CIString("co").Select(_ => Operation.Contains),
CIString("sw").Select(_ => Operation.StartsWith),
CIString("pr").Select(_ => Operation.Present))
.Labelled("compare operation");
/// <remarks><code>
/// compValue = false / null / true / number / string
/// ; rules from JSON (RFC 7159)
/// </code></remarks>
public static Parser<char, Value> CompValue { get; }
= CreateCompValue().Labelled("compare value");
private static Parser<char, Value> CreateCompValue()
{
var booleans = CIString("false").Or(CIString("true"))
.Select(x => new Value(ValueKind.Boolean, x.ToLowerInvariant()));
var nulls = CIString("null")
.Select(_ => new Value(ValueKind.Null, null));
var quote = '"';
var strings = Token(c => c != quote).ManyString().Between(Char(quote))
.Select(x => new Value(ValueKind.String, x));
var numbers = CreateNumberParser();
return OneOf(numbers, strings, booleans, nulls);
}
/// <remarks><code>
/// number = [ minus ] int [ frac ] [ exp ]
/// decimal-point = %x2E ; .
/// digit1-9 = %x31-39 ; 1-9
/// e = %x65 / %x45 ; e E
/// exp = e [ minus / plus ] 1*DIGIT
/// frac = decimal-point 1*DIGIT
/// int = zero / ( digit1-9 *DIGIT )
/// minus = %x2D ; -
/// plus = %x2B ; +
/// zero = %x30 ; 0
/// </code></remarks>
private static Parser<char, Value> CreateNumberParser()
{
var isOneToNineDigit = Parser<char>.Token((char c) => c != '0' && char.IsDigit(c));
var nonLeadingZeroInteger = Map((x, rest) => x + rest, isOneToNineDigit, Digit.ManyString());
var plus = Char('+');
var minus = Char('-');
var decimalPoint = Char('.');
var zero = Char('0');
var singleZero = zero.Before(Not(Digit)).Select(_ => "0");
var integer = nonLeadingZeroInteger.Or(singleZero);
var frac = Map(
static (decimalPoint, digets) => decimalPoint + digets,
decimalPoint,
nonLeadingZeroInteger);
var exp = Map(
(e, plusMinus, num) => plusMinus.Match(pm => $"{e}{pm}{num}", () => e + num),
CIChar('E'),
plus.Or(minus).Optional(),
nonLeadingZeroInteger);
return Map(
static (minus, integer, frac, exp) =>
{
var kind = frac.HasValue || exp.HasValue ? ValueKind.Double : ValueKind.Integer;
var value = minus.Match<string?>(x => x + integer, () => integer);
value = frac.Match(x => value + x, () => value);
value = exp.Match(x => value + x, () => value);
return new Value(kind, value);
},
minus.Optional(),
integer,
frac.Optional(),
exp.Optional());
}
/// <remarks><code>
/// attrExp = (attrPath SP "pr") /
/// (attrPath SP compareOp SP compValue)
/// </code></remarks>
public static Parser<char, AttrExpression> AttrExp
= CreateAttrExp();
private static Parser<char, AttrExpression> CreateAttrExp()
{
var space = Char(' ');
var attrExp = Map(
static (path, opVal) => new AttrExpression(path, opVal.Operation, opVal.Value),
AttrPath.Before(space),
CompareOp.Bind(
op =>
{
return op == Operation.Present
? Not(space).Select(_ => Value.None)
: space.Then(CompValue);
},
(op, value) => (Operation: op, Value: value)));
return attrExp;
}
/// <remarks><code>
/// FILTER = attrExp / logExp / valuePath / *1"not" "(" FILTER ")"
/// </code></remarks>
public static Parser<char, FilterExpression> Filter
= CreateFilter();
private static Parser<char, FilterExpression> CreateFilter()
{
...
}
}
Have a look in the Pidgin.Expression
namespace, that's where I put the stuff to help with recursive grammars.
In your case, I think the code would look something like (untested):
And = String("and")
.ThenReturn<Func<FilterExpression, FilterExpression, FilterExpression>>((x, y) => new AndExpression(x, y));
Or = String("or")
.ThenReturn<Func<FilterExpression, FilterExpression, FilterExpression>>((x, y) => new OrExpression(x, y));
Filter = ExpressionParser.Build(expr =>
OneOf( // parse a single term
AttrExp,
ValuePath,
String("not").Then(Parenthesised(expr)).Select(x => new NotExpression(x)),
Parenthesised(expr)
),
new[] { Operator.InfixL(And), Operator.InfixL(Or) } // define operators to combine terms
);
(I more or less copied this from the example, I know the docs for ExpressionParser
are somewhat lacking!) The BNF doesn't have anything to say about precedence or associativity, so I took a punt; if you need to change that you'd adjust the line marked // define operators
.
A couple of general pointers which you may already know:
ExpressionParser
gets around this by parsing a flat list of operators and then re-associating it in a second pass.)Hope this helps!
Thank you very much for your reply. I am quite a newbie when it comes to parsers, so appreciate any input (its been 16 years since I had that compiler course at uni).
What I gather is that it should be possible with Pidgin, so what I will attempt is to do something much simpler like the Pidgin expression sample and try to understand what goes on with that. Your docs have been very useful so far, but if you have time at some point, just adding some inline comments in the expression sample would probably help quite a bit.
Anyway, my current status: Right now, I do have something that doesn't stack overflow (an improvement), but also doesn't work with the and
and or
cases:
private static Parser<char, FilterExpression> CreateFilter()
{
var and = String("and")
.ThenReturn<Func<FilterExpression, FilterExpression, FilterExpression>>(
static (left, right) => new AndExpression(left, right));
var or = String("or")
.ThenReturn<Func<FilterExpression, FilterExpression, FilterExpression>>(
static (left, right) => new OrExpression(left, right));
var filter = ExpressionParser.Build(expr =>
OneOf( // parse a single term
AttrExp.Cast<FilterExpression>(),
String("not").Then(Parenthesised(expr)).Select<FilterExpression>(static filter => new NotExpression(filter)),
Parenthesised(expr)
),
new[] { InfixL(and), InfixL(or) } // define operators to combine terms
);
return filter;
}
This works for "userName eq \"foo\""
, but not "userName eq \"foo\" and age lt 42"
. In the latter case, it just returns an AttrExpression
for the first part, and not an AndExpression
with both parts.
Note: I have purposely not ValuePath
part of the filter language yet, I want to get logExp
parts added in their first.
Ps. once I get these bits working and finish the rest of the SCIM API implementation, the code will be open sourced, for what its worth.
Does AttrExp
consume trailing whitespace?
I do not think AttrExp
consume trailing whitespace, no. It looks like this:
var space = Char(' ');
var attrExp = Map(
static (path, opVal) => new AttrExpression(path, opVal.Operation, opVal.Value),
AttrPath.Before(space),
CompareOp.Bind(
op =>
{
return op == Operation.Present
? Not(space).Select(_ => Value.None)
: space.Then(CompValue);
},
(op, value) => (Operation: op, Value: value)));
I guess it should not, since there might not be any trailing whitespace, and that would be legal.
The input userName eq \"foo\" and age lt 42
has a whitespace character between the AttrExp
and the and
token, so that's why it's failing to recognise the and
. I generally suggest applying SkipWhitespaces
after each token
Ahh of course. The spaces, SP
, are also specified in the ABNF, so they should be there.
If I change and
and or
to this it works:
var and = String("and").Between(SkipWhitespaces)
.ThenReturn<Func<FilterExpression, FilterExpression, FilterExpression>>(
static (left, right) => new AndExpression(left, right));
var or = String("or").Between(SkipWhitespaces)
.ThenReturn<Func<FilterExpression, FilterExpression, FilterExpression>>(
static (left, right) => new OrExpression(left, right));
But I am curious why. In the filter definition, AttrExp
comes first in the OneOf
parser, so I would think it tries that first, and that just matches on that. Why does it actually try the Parenthesised
(i'm guessing that is actually the logExp part of the abnf) if the AttrExp matches?
var filter = ExpressionParser.Build(expr =>
OneOf( // parse a single term
AttrExp.Cast<FilterExpression>(),
Parenthesised(expr), // logExp?
String("not").Then(Parenthesised(expr)).Select<FilterExpression>(static filter => new NotExpression(filter))
),
new[] { InfixL(and), InfixL(or) } // define operators to combine terms
);
Sorry, this may have been unclear: expr
is a recursive reference to filter
. The grammar says a FILTER
can contain a parenthesised child FILTER
(optionally preceded by not
):
FILTER = ... / *1"not" "(" FILTER ")"
So that's what I was trying to do with the second two items in the OneOf
. logExp
doesn't have an exact analogue in my code, though I spose you could say that the // define operators
line does the job of logExp
.
Hope this helps!
OK, I figured that expr
was the recursive filter. But I get now that logExp is implicitly supported because and/or is in there.
So I guess ExpressionParser
will try one of the parsers in the OneOf
parser, then attempt to match with one of the operators, and then go again. So that is why the filter parser works for userName eq \"foo\" and age lt 42
.
It definitely helps!
So I think I managed to get everything working.
This is the filter parts I ended up with, if anybody stumbles onto this issue in the future. Any suggestions for improvements are of course welcome.
/// <remarks><code>
/// FILTER = attrExp / logExp / valuePath / *1"not" "(" FILTER ")"
// </code></remarks>
public static Parser<char, FilterExpression> Filter
= CreateFilter();
private static Parser<char, T> Tok<T>(Parser<char, T> token)
=> Try(token).Before(SkipWhitespaces);
private static Parser<char, string> Tok(string token)
=> Tok(String(token));
private static Parser<char, T> Parenthesised<T>(Parser<char, T> parser)
=> parser.Between(Tok(Char('(')), Tok(Char(')')));
private static Parser<char, T> Brackets<T>(Parser<char, T> parser)
=> parser.Between(Tok(Char('[')), Tok(Char(']')));
private static Parser<char, FilterExpression> CreateFilter()
{
var and = Tok("and")
.ThenReturn<Func<FilterExpression, FilterExpression, FilterExpression>>(
static (left, right) => new AndExpression(left, right));
var or = Tok("or")
.ThenReturn<Func<FilterExpression, FilterExpression, FilterExpression>>(
static (left, right) => new OrExpression(left, right));
var filter = default(Parser<char, FilterExpression>);
filter = ExpressionParser.Build(expr =>
OneOf( // parse a single term
Tok(Map<char, AttrPath, FilterExpression, FilterExpression>(static (path, filter) => new GroupExpression(path, filter),
Tok(AttrPath),
Brackets(expr))), // valuePath
Tok("not")
.Then(Parenthesised(expr))
.Select<FilterExpression>(static filter => new NotExpression(filter)),
Tok(AttrExp.Cast<FilterExpression>()),
Parenthesised(expr)
),
new[] { InfixL(and), InfixL(or) } // and has higher precedence than or
);
return filter;
}
Thanks again @benjamin-hodgson.
Do you have a sponsor button or similar, that I can use to throw a little coin at you for all the help you have provided?
Oh, that's very kind of you but won't be necessary! I have a full time job; I just do open source work for fun. Hopefully it never becomes time-consuming or stressful enough that I feel like I need to make money for it to be sustainable.
If you're keen to contribute, the best way you could help would be to pick up a bug or feature request from the issues board! For example, you might be interested in contributing some documentation improvements for ExpressionParser
, now that you have some experience with it. (Obviously completely optional, no pressure or expectations from me.)
OK, I appreciate that.
I would love to be able to contribute docs, but I am still at the copy/paste-hack-until-it-works level of understanding, but I'll will keep playing with Pidgin and jump back in if I get comfortable enough.
Thanks again!
I have been trying to implement a parser for the SCIM protocols filter language with Pidgin, but I am now stuck because I think I might need left recursion to finish the job. I hope its my lack of understanding though.
The ABNF description of the filter language is as follows (full spec):
In the above ABNF rules, the "compValue" (comparison value) rule is built on JSON Data Interchange format ABNF rules as specified in [RFC7159], "DIGIT" and "ALPHA" are defined per Appendix B.1 of [RFC5234], and "URI" is defined per Appendix A of [RFC3986].
I have been able to implement parsers for everything up till
FILTER
, which useslogExp
and itself recursively. My attempts so far seems to result in a stack overflow.Is there a way forward with Pidgin I am not seeing, or is this simply a limitation I cannot get around with this particular filter language?
Perhaps I could rewrite the grammar to not be left recursive, but that I fear I may not smart enough to do that. Anyway, if there is a low hanging fruit I am not seeing, I hope somebody can point me in the right direction.
Thanks, Egil.
ps. Consider enabling the Discussions feature on the GitHub repo if you do not want questions like this as an issue.