Closed gkellogg closed 1 year ago
The following grammar, which pulls in productions from RFC 3987, parses correctly and could serve as a starting point:
IRI = scheme ":" ihier-part [ "?" iquery ]
[ "#" ifragment ]
ihier-part = "//" iauthority ipath-abempty
/ ipath-absolute
/ ipath-rootless
/ ipath-empty
IRI-reference = IRI / irelative-ref
absolute-IRI = scheme ":" ihier-part [ "?" iquery ]
irelative-ref = irelative-part [ "?" iquery ] [ "#" ifragment ]
irelative-part = "//" iauthority ipath-abempty
/ ipath-absolute
/ ipath-noscheme
/ ipath-empty
iauthority = [ iuserinfo "@" ] ihost [ ":" port ]
iuserinfo = *( iunreserved / pct-encoded / sub-delims / ":" )
ihost = IP-literal / IPv4address / ireg-name
ireg-name = *( iunreserved / pct-encoded / sub-delims )
ipath = ipath-abempty ; begins with "/" or is empty
/ ipath-absolute ; begins with "/" but not "//"
/ ipath-noscheme ; begins with a non-colon segment
/ ipath-rootless ; begins with a segment
/ ipath-empty ; zero characters
ipath-abempty = *( "/" isegment )
ipath-absolute = "/" [ isegment-nz *( "/" isegment ) ]
ipath-noscheme = isegment-nz-nc *( "/" isegment )
ipath-rootless = isegment-nz *( "/" isegment )
ipath-empty = 0<ipchar>
isegment = *ipchar
isegment-nz = 1*ipchar
isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
/ "@" )
; non-zero-length segment without any colon ":"
ipchar = iunreserved / pct-encoded / sub-delims / ":"
/ "@"
iquery = *( ipchar / iprivate / "/" / "?" )
ifragment = *( ipchar / "/" / "?" )
iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
/ %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
/ %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
/ %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
/ %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
/ %xD0000-DFFFD / %xE1000-EFFFD
iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
port = *DIGIT
pct-encoded = "%" HEXDIG HEXDIG
IP-literal = "[" ( IPv6address / IPvFuture ) "]"
IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
IPv6address = 6( h16 ":" ) ls32
/ "::" 5( h16 ":" ) ls32
/ [ h16 ] "::" 4( h16 ":" ) ls32
/ [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
/ [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
/ [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
/ [ *4( h16 ":" ) h16 ] "::" ls32
/ [ *5( h16 ":" ) h16 ] "::" h16
/ [ *6( h16 ":" ) h16 ] "::"
h16 = 1*4HEXDIG
ls32 = ( h16 ":" h16 ) / IPv4address
IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
dec-octet = DIGIT ; 0-9
/ %x31-39 DIGIT ; 10-99
/ "1" 2DIGIT ; 100-199
/ "2" %x30-34 DIGIT ; 200-249
/ "25" %x30-35 ; 250-255
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = gen-delims / sub-delims
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
/ "*" / "+" / "," / ";" / "="
The HTML production needs some work, but I get the following:
IRI |
::= | scheme ": " ihier-part ( "? " iquery) ? ( "# " ifragment) ? |
ihier-part |
::= | ( "//" iauthority ipath-abempty) | ipath-absolute | ipath-rootless | ipath-empty |
IRI-reference |
::= | IRI | irelative-ref |
absolute-IRI |
::= | scheme ": " ihier-part ( "? " iquery) ? |
irelative-ref |
::= | irelative-part ( "? " iquery) ? ( "# " ifragment) ? |
irelative-part |
::= | ( "//" iauthority ipath-abempty) | ipath-absolute | ipath-noscheme | ipath-empty |
iauthority |
::= | ( iuserinfo "@ ") ? ihost ( ": " port) ? |
iuserinfo |
::= | ( iunreserved | pct-encoded | sub-delims | ": ") * |
ihost |
::= | IP-literal | IPv4address | ireg-name |
ireg-name |
::= | ( iunreserved | pct-encoded | sub-delims) * |
ipath |
::= | ipath-abempty | ipath-absolute | ipath-noscheme | ipath-rootless | ipath-empty |
ipath-abempty |
::= | ( "/ " isegment) * |
ipath-absolute |
::= | "/ " ( isegment-nz ( "/ " isegment) * ) ? |
ipath-noscheme |
::= | isegment-nz-nc ( "/ " isegment) * |
ipath-rootless |
::= | isegment-nz ( "/ " isegment) * |
ipath-empty |
::= | |
isegment |
::= | ipchar* |
isegment-nz |
::= | ipchar+ |
isegment-nz-nc |
::= | ( iunreserved | pct-encoded | sub-delims | "@ ") + |
ipchar |
::= | iunreserved | pct-encoded | sub-delims | ": " | "@ " |
iquery |
::= | ( ipchar | iprivate | "/ " | "? ") * |
ifragment |
::= | ( ipchar | "/ " | "? ") * |
iunreserved |
::= | ALPHA | DIGIT | "- " | ". " | "_ " | "~ " | ucschar |
ucschar |
::= | [ #xA0 - #xD7FF ] |
| | [ #xF900 - #xFDCF ] |
|
| | [ #xFDF0 - #xFFEF ] |
|
| | [ #x00010000 - #x0001FFFD ] |
|
| | [ #x00020000 - #x0002FFFD ] |
|
| | [ #x00030000 - #x0003FFFD ] |
|
| | [ #x00040000 - #x0004FFFD ] |
|
| | [ #x00050000 - #x0005FFFD ] |
|
| | [ #x00060000 - #x0006FFFD ] |
|
| | [ #x00070000 - #x0007FFFD ] |
|
| | [ #x00080000 - #x0008FFFD ] |
|
| | [ #x00090000 - #x0009FFFD ] |
|
| | [ #x000A0000 - #x000AFFFD ] |
|
| | [ #x000B0000 - #x000BFFFD ] |
|
| | [ #x000C0000 - #x000CFFFD ] |
|
| | [ #x000D0000 - #x000DFFFD ] |
|
| | [ #x000E1000 - #x000EFFFD ] |
|
iprivate |
::= | [ #xE000 - #xF8FF ] | [ #x000F0000 - #x000FFFFD ] | [ #x00100000 - #x0010FFFD ] |
scheme |
::= | ALPHA ( ALPHA | DIGIT | "+ " | "- " | ". ") * |
port |
::= | DIGIT* |
pct-encoded |
::= | "% " HEXDIG HEXDIG |
IP-literal |
::= | "[ " ( IPv6address | IPvFuture) "] " |
IPvFuture |
::= | "v " HEXDIG+ ". " ( unreserved | sub-delims | ": ") + |
IPv6address |
::= | ( ( ( h16 ": ") ( h16 ": ") ( h16 ": ") ( h16 ": ") ( h16 ": ") ( h16 ": ") ) ls32) |
| | ( "::" ( ( h16 ": ") ( h16 ": ") ( h16 ": ") ( h16 ": ") ( h16 ": ") ) ls32) |
|
| | ( h16? "::" ( ( h16 ": ") ( h16 ": ") ( h16 ": ") ( h16 ": ") ) ls32) |
|
| | ( ( ( h16 ": ") ? h16) ? "::" ( ( h16 ": ") ( h16 ": ") ( h16 ": ") ) ls32) |
|
| | ( ( ( ( ( h16 ": ") ( h16 ": ") ? ) ? ) h16) ? "::" ( ( h16 ": ") ( h16 ": ") ) ls32) |
|
| | ( ( ( ( ( h16 ": ") ( ( h16 ": ") ( h16 ": ") ? ) ? ) ? ) h16) ? "::" h16 ": " ls32) |
|
| | ( ( ( ( ( h16 ": ") ( ( h16 ": ") ( ( h16 ": ") ( h16 ": ") ? ) ? ) ? ) ? ) h16) ? "::" ls32) |
|
| | ( ( ( ( ( h16 ": ") ( ( h16 ": ") ( ( h16 ": ") ( ( h16 ": ") ( h16 ": ") ? ) ? ) ? ) ? ) ? ) h16) ? "::" h16) |
|
| | ( ( ( ( ( h16 ": ") ( ( h16 ": ") ( ( h16 ": ") ( ( h16 ": ") ( ( h16 ": ") ( h16 ": ") ? ) ? ) ? ) ? ) ? ) ? ) h16) ? "::") |
|
h16 |
::= | HEXDIG ( HEXDIG ( HEXDIG HEXDIG? ) ? ) ? |
IPv4address |
::= | dec-octet ". " dec-octet ". " dec-octet ". " dec-octet |
dec-octet |
::= | DIGIT | ( [ #x31 - #x39 ] DIGIT) | ( "1 " ( DIGIT DIGIT) ) | ( "2 " [ #x30 - #x34 ] DIGIT) | ( "25" [ #x30 - #x35 ] ) |
unreserved |
::= | ALPHA | DIGIT | "- " | ". " | "_ " | "~ " |
reserved |
::= | gen-delims | sub-delims |
gen-delims |
::= | ": " | "/ " | "? " | "# " | "[ " | "] " | "@ " |
sub-delims |
::= | "! " | "$ " | "& " | "' " | "( " | ") " | "* " | "+ " | ", " | "; " | "= " |
ALPHA |
::= | [ #x41 - #x5A #x61 - #x7A ] |
DIGIT |
::= | [ #x30 - #x39 ] |
HEXDIG |
::= | DIGIT | [ A-F ] |
We probably just need to update some of the productions by hand, unless there's a better tool out there for generating the HTML.
I suggest keeping the rule order of RFC 3986.
IRI = scheme ":" ihier-part [ "?" iquery ] [ "#" ifragment ]
ihier-part = "//" iauthority ipath-abempty
/ ipath-absolute
/ ipath-rootless
/ ipath-empty
IRI-reference = IRI / irelative-ref
absolute-IRI = scheme ":" ihier-part [ "?" iquery ]
irelative-ref = irelative-part [ "?" iquery ] [ "#" ifragment ]
irelative-part = "//" iauthority ipath-abempty
/ ipath-absolute
/ ipath-noscheme
/ ipath-empty
scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
iauthority = [ iuserinfo "@" ] ihost [ ":" port ]
iuserinfo = *( iunreserved / pct-encoded / sub-delims / ":" )
ihost = IP-literal / IPv4address / ireg-name
port = *DIGIT
IP-literal = "[" ( IPv6address / IPvFuture ) "]"
IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
IPv6address = 6( h16 ":" ) ls32
/ "::" 5( h16 ":" ) ls32
/ [ h16 ] "::" 4( h16 ":" ) ls32
/ [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
/ [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
/ [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
/ [ *4( h16 ":" ) h16 ] "::" ls32
/ [ *5( h16 ":" ) h16 ] "::" h16
/ [ *6( h16 ":" ) h16 ] "::"
h16 = 1*4HEXDIG
ls32 = ( h16 ":" h16 ) / IPv4address
IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
dec-octet = DIGIT ; 0-9
/ %x31-39 DIGIT ; 10-99
/ "1" 2DIGIT ; 100-199
/ "2" %x30-34 DIGIT ; 200-249
/ "25" %x30-35 ; 250-255
ireg-name = *( iunreserved / pct-encoded / sub-delims )
ipath = ipath-abempty ; begins with "/" or is empty
/ ipath-absolute ; begins with "/" but not "//"
/ ipath-noscheme ; begins with a non-colon segment
/ ipath-rootless ; begins with a segment
/ ipath-empty ; zero characters
ipath-abempty = *( "/" isegment )
ipath-absolute = "/" [ isegment-nz *( "/" isegment ) ]
ipath-noscheme = isegment-nz-nc *( "/" isegment )
ipath-rootless = isegment-nz *( "/" isegment )
ipath-empty = 0<ipchar>
isegment = *ipchar
isegment-nz = 1*ipchar
isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims / "@" )
; non-zero-length segment without any colon ":"
ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@"
iquery = *( ipchar / iprivate / "/" / "?" )
ifragment = *( ipchar / "/" / "?" )
iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
/ %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
/ %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
/ %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
/ %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
/ %xD0000-DFFFD / %xE1000-EFFFD
iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
pct-encoded = "%" HEXDIG HEXDIG
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = gen-delims / sub-delims
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
/ "*" / "+" / "," / ";" / "="
HEXDIG, DIGIT, and ALPHA are from the "Core Rules" of the ABNF spec (RFC 5234, Appendix B.1).
> HEXDIG, DIGIT, ALPHA are from the ABNF spec "Core Rules".
That's an artifact of the serializer, which needs some more updates. I'll re-generate for a PR.
As long as we are clear where they come from (i.e. not us!).
Hmm - RFC 3986, section 2.1 "Percent-Encoding", says:
> The uppercase hexadecimal digits 'A' through 'F' are equivalent to the lowercase digits 'a' through 'f', respectively. If two URIs differ only in the case of hexadecimal digits used in percent-encoded octets, they are equivalent.
but HEXDIG is upper case only. It is the "Case Normalization" step of the Comparison Ladder that adjusts them.
See https://github.com/w3c/rdf-concepts/issues/15#issuecomment-1436962088 for the discussion leading to this issue.