martok / palefill

Inject Polyfills for various web technologies into pages requiring them
https://martok.github.io/palefill/
Mozilla Public License 2.0
79 stars 9 forks source link

RegExp named capturing groups #42

Closed SeaHOH closed 1 year ago

SeaHOH commented 2 years ago

They have started using this feature. e.g. https://github.com/orgs/opensearch-project/projects/30

Related scripts: https://github.githubassets.com/assets/vendors-node_modules_memex-frontend-latest_dist_client_app_bundle_js-e03fc02fea42.js https://github.githubassets.com/assets/chunk-app_assets_modules_github_insights_insights-query_ts-0b114b80f914.js

I only get blank pages here. I think we need a polyfill until Moonchild adds this feature, and I have finished one.

1. The polyfill (click) ```javascript /* ============================================================================= RegExp Named Capturing Groups polyfill ============================================================================= Only works with RegExp(...); polyfilling the literal notation is impossible. working: // /Foo(?<bar>bar)/i RegExp("Foo(?<bar>bar)", "i") RegExp(RegExp("Foo(?<bar>bar)"), "i") not working: // SyntaxError: invalid regexp group /Foo(?<bar>bar)/i RegExp(/Foo(?<bar>bar)/, "i") ============================================================================= The MIT License (MIT) ============================================================================= Copyright (c) 2017- Commenthol https://github.com/commenthol/named-regexp-groups Copyright (c) 2017- lifaon74 https://github.com/lifaon74/regexp-polyfill Copyright (c) 2022- SeaHOH https://github.com/SeaHOH Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/* (this line also terminates the license header comment that opens on the preceding line) */
(function () {
    // Feature detection: engines that already parse named capturing groups
    // need no polyfill, so leave the native RegExp untouched.
    try {
        new RegExp("(?<foo>foo)");
        return;
    } catch (e) {
        // The engine rejected the syntax: install the polyfill below.
    }

    // Capturing sub-pattern for a group name (ASCII identifier, at most 51 characters).
    const S_NAME = "([a-zA-Z_$][a-zA-Z_$0-9]{0,50})";
    // "$$" is listed first so that an escaped dollar is consumed as a unit and
    // is never mistaken for the start of a "$<name>" reference.
    const R_NAME_REPLACE = new RegExp("\\$\\$|\\$<" + S_NAME + ">", "g");
    // Group content of the form "?&name": re-use the body of the named group `name`.
    const R_NAMED_BACKREF = new RegExp("^[?:]&" + S_NAME);
    // Group content of the form "?<name>body".
    const R_GROUP = new RegExp("^[?:]<" + S_NAME + ">([^]*)");
    // Tokenizer: splits a pattern on parentheses, keeping an optional leading backslash.
    const R_GROUPS = /(\\?[()])/g;
    // Group prefixes that do not capture: (?:...), (?=...), (?!...), (?<=...), (?<!...).
    const R_NON_CAPTURING = /^\?(?:[:!=]|<[!=])/;
    // Scanner over pattern atoms: \k<name> | any other escape | a whole
    // character class | a bare dot. Escapes and classes are matched only so
    // that they are skipped as a unit.
    const R_ATOMS = new RegExp(
        "\\\\k<" + S_NAME + ">|\\\\[^]|\\[(?:\\\\[^]|[^\\]\\\\])*\\]|\\.", "g");
    // Flags in the order used when rebuilding the `flags` string.
    const A_FLAGS = "dgimsuy".split("");
    // True when the engine rejects the "s" (dotAll) flag.
    const dotAllBroken = (() => {
        try {
            new RegExp("", "s");
        } catch (e) {
            return true;
        }
        return false;
    })();

    /**
     * Translate a pattern that may contain named groups into one made of
     * numbered groups only.
     *
     * Known limitation: tokenization only looks at parentheses, so a "(" inside
     * a character class or after an escaped backslash is still seen as a group.
     *
     * @param {*} str - pattern; converted with String().
     * @returns {{source: string, groups: Object, named: Object}}
     *   `source` is the pattern with every "?<name>" prefix removed,
     *   `groups` maps each group name to its capture index,
     *   `named` maps each group name to the text of its body.
     */
    function generate(str) {
        // Prototype-less dictionaries, so names such as "constructor" are safe.
        const groups = Object.create(null);
        const named = Object.create(null);
        const arr = String(str).split(R_GROUPS);
        const store = {
            groups: [""], // text accumulated for every currently open group
            names: []     // name of every currently open group ("" if unnamed)
        };
        let index = 0;    // running capture-group number

        const source = arr.map((token, i) => {
            let part = token;
            switch (token) {
                case "(":
                    store.groups.push("");
                    store.names.push("");
                    break;
                case ")": {
                    const block = store.groups.pop();
                    const name = store.names.pop();
                    if (name) {
                        // `block` starts with the opening "("; keep the body only.
                        named[name] = block.substr(1);
                    }
                    break;
                }
                default:
                    // Text directly after "(" that is not a cluster, assertion or
                    // lookbehind starts a capturing group.
                    if (arr[i - 1] === "(" && !R_NON_CAPTURING.test(token)) {
                        index++;
                        const group = R_GROUP.exec(token);
                        if (group && group[1]) {
                            // The first occurrence of a name wins.
                            if (!groups[group[1]]) {
                                store.names[store.names.length - 1] = group[1];
                                groups[group[1]] = index;
                            }
                            // An empty named group stays an empty capture so that
                            // the numbering is not disturbed.
                            part = group[2] || "";
                        }
                        const ref = R_NAMED_BACKREF.exec(part);
                        if (ref && ref[1]) {
                            part = named[ref[1]] || "";
                        }
                    }
                    break;
            }
            store.groups = store.groups.map((text) => text + part);
            return part;
        }).join("");
        // Empty groups are deliberately left in place: they were counted above,
        // and removing them would shift every later capture index.
        return { source, groups, named };
    }

    /**
     * Rewrite the atoms the target engine cannot handle natively.
     *
     * @param {string} source - pattern produced by generate().
     * @param {Object} groups - group name to capture index.
     * @param {boolean} expandDot - replace every bare "." with "[\s\S]".
     * @returns {string} the rewritten pattern.
     */
    function rewriteAtoms(source, groups, expandDot) {
        return source.replace(R_ATOMS, (atom, name) => {
            if (name !== undefined) {
                // \k<name> becomes a numbered backreference; the wrapper keeps
                // a following digit or quantifier from changing its meaning.
                return name in groups ? "(?:\\" + groups[name] + ")" : atom;
            }
            return expandDot && atom === "." ? "[\\s\\S]" : atom;
        });
    }

    const NativeRegExp = RegExp;

    /**
     * RegExp subclass emulating named capturing groups and, where the engine
     * lacks it, the "s" (dotAll) flag. The constructor hands back a plain
     * native RegExp whenever no emulation is needed.
     */
    class NamedRegExp extends NativeRegExp {
        /**
         * @param {*} pattern - pattern string, or a RegExp to copy.
         * @param {string} [flags] - flags; when omitted and `pattern` is a
         *   RegExp, its flags are inherited.
         */
        constructor(pattern, flags) {
            let input = pattern;
            let flagStr = flags;
            if (input instanceof NativeRegExp) {
                // Read the flags before replacing the object with its source.
                if (flagStr === undefined) {
                    flagStr = input.flags;
                }
                input = input.source;
            }
            flagStr = String(flagStr || "");

            const nativeFlags = flagStr.replace("s", "");
            const dotall = nativeFlags !== flagStr;
            const expandDot = dotall && dotAllBroken;
            const { source, groups } = generate(input);
            const named = Object.keys(groups).length > 0;

            if (!named && !expandDot) {
                return new NativeRegExp(input, flagStr);
            }
            // Drop "s" only when it is emulated; otherwise keep the native flag.
            super(rewriteAtoms(source, groups, expandDot),
                expandDot ? nativeFlags : flagStr);
            this._source = String(input);
            this._dotall = dotall;
            if (named) {
                this._groups = groups;
            }
            this._flags = A_FLAGS.filter((flag) => flagStr.includes(flag)).join("");
        }

        /** Original pattern text, including the named-group syntax. */
        get source() {
            return this._source;
        }

        /** Whether the "s" flag was requested. */
        get dotAll() {
            return this._dotall;
        }

        /** Requested flags, including "s" when it is emulated. */
        get flags() {
            return this._flags;
        }

        /**
         * Attach a `groups` dictionary to a match result.
         *
         * @param {?Array} res - exec() result, or the argument list of a
         *   replacer function.
         * @returns {Object|undefined} the dictionary, or undefined when there
         *   is no match or the pattern has no named groups.
         */
        _updateGroups(res) {
            if (!res || !this._groups) {
                return undefined;
            }
            res.groups = Object.create(null);
            Object.entries(this._groups).forEach(([name, index]) => {
                res.groups[name] = res[index];
            });
            return res.groups;
        }

        /** Native exec() whose result additionally carries `groups`. */
        exec(str) {
            const res = super.exec(str);
            this._updateGroups(res);
            return res;
        }

        /**
         * Native replace with support for "$<name>" in replacement strings and
         * a trailing `groups` argument for replacer functions.
         */
        [Symbol.replace](str, replacement) {
            const groups = this._groups;
            let repl = replacement;
            if (groups) {
                if (typeof replacement === "function") {
                    repl = (...args) => {
                        args.push(this._updateGroups(args));
                        return replacement(...args);
                    };
                } else {
                    repl = String(replacement).replace(R_NAME_REPLACE, (token, name) => {
                        if (name === undefined) {
                            return token; // "$$": escaped dollar, keep as is
                        }
                        const index = groups[name];
                        if (index === undefined) {
                            return "";
                        }
                        // Two-digit form so that a following digit cannot be
                        // absorbed into the group number.
                        return "$" + (index < 10 ? "0" : "") + index;
                    });
                }
            }
            return super[Symbol.replace](str, repl);
        }
    }

    // Replace the global constructor; literals keep using the native prototype.
    RegExp = function RegExp(pattern, flags) {
        return new NamedRegExp(pattern, flags);
    };
    RegExp.prototype = NativeRegExp.prototype;
}());
/* ``` */
/* 2. The babel polyfill (click) ```javascript */
/**
 * Replacer for `namedGroupsSub`: turns a matched regex literal into an
 * equivalent RegExp("...", "...") call.
 *
 * @param {string} match - the whole matched text (unused).
 * @param {string} splitS - the character preceding the literal ("=" or "(").
 * @param {string} pattern - the literal's body, without the delimiting slashes.
 * @param {string} flags - the literal's flags.
 * @param {string} splitE - the character following the literal ("," ";" or ")").
 * @returns {string} replacement source text.
 */
function namedGroupsFill(match, splitS, pattern, flags, splitE) {
    // Escape backslashes and double quotes so the body survives as a string literal.
    const escaped = pattern.replace(/[\\"]/g, "\\$&");
    return `${splitS}RegExp("${escaped}","${flags}")${splitE}`;
}

/**
 * Arguments for String.prototype.replace: a matcher for regex literals that
 * contain "(?<" and the replacer above.
 *
 * A literal is only recognised directly after "=" or "(" and directly before
 * ",", ";" or ")". Its body is scanned atom by atom, so an escaped character
 * or a whole character class (which may contain "/") never ends the literal
 * early, and a match cannot run across a line break.
 */
const namedGroupsSub = (() => {
    // One body atom: an escape, a complete character class, or any other
    // character except "/", "\", "[" and newline.
    const atom = String.raw`(?:\\.|\[(?:\\.|[^\]\\\n])*\]|[^/\\\[\n])`;
    const literal = new RegExp(
        String.raw`([=(])\/(${atom}*?\(\?<${atom}+)\/([dgimsuy]*)([,;)])`, "g");
    return [literal, namedGroupsFill];
})();
/*
/vendors-node_modules_memex-frontend-latest_dist_client_app_bundle_js-*.js
/chunk-app_assets_modules_github_insights_insights-query_ts-*.js
NAMED_CAPTURING_GROUPS_DONE = NAMED_CAPTURING_GROUPS.replace(...namedGroupsSub);
*/
/* ``` */
martok commented 2 years ago

I had looked into using XRegExp before, but scrapped that idea because I couldn't get a robust method to replace literal expressions.

const namedGroupsSub = [/([=(])\/((?:[^/]|\\\/)*?\(\?<(?:[^/]|\\\/)+)\/([dgimsuy]*)([,;)])/g, namedGroupsFill];
*/

If that expression works reliably, it could be used to rewrite all Regex to XRegExp, which would also give us support for Unicode categories (\p{FOO}) and a few other things on top of your polyfill.

SeaHOH commented 2 years ago

XRegExp looks powerful, but it seems unsuitable as a polyfill: it is a new style of API, not just an extension.

martok commented 2 years ago

Not really, they also just extend the RegExp objects. All the other things are optional... but even minified, the simplest useful version compiles to ~200kB, that's a lot.

I'd much rather take yours then, but to be able to get rid of all handcrafted replacements I'm going to throw it at a whole bunch of test cases before integration.

SeaHOH commented 2 years ago

But I saw the description below in the docs; they would have to do more work for ECMAScript compatibility in order to provide polyfill support.

// Using named capture and flag x for free-spacing and line comments
const date = XRegExp(
    `(?<year>  [0-9]{4} ) -?  # year
     (?<month> [0-9]{2} ) -?  # month
     (?<day>   [0-9]{2} )     # day`, 'x');

// XRegExps compile to RegExps and work with native methods
// However, named captures must be referenced using numbered backreferences
// if used with native methods
'2021-02-22'.replace(date, '$2/$3/$1');
// -> '02/22/2021'
martok commented 2 years ago

Quick update: the last two weekends were spent on writing a parser-based polyfill (temporary repo, force-pushed quite often) based on the code here and a new expression scanner. The idea is to have it robust enough that we can just throw it at all JS and it figures out what needs replacement automatically. Luckily, regular expressions are regular (🤭), so a scanner even for complicated expressions is fairly straightforward (at least until a tc39 proposal lands that makes character classes a language of their own...).

This source compiles to two parts: a runtime segment that gets inserted as a polyfill, and a host segment running in Palefill that replaces /foo/ literals with RegExp("foo") calls. Terser's conditional compilation, based on dead code elimination, is pretty neat.

Preliminary performance impact is about 10ms/1MB source for NamedRegExpShim.replaceRegExpLiterals and <=2ms per expression for the runtime transpiling part. With time-to-usable-interface for many "modern" sites upward of 10s, the added time is insignificant.

Downside: it is currently a bit over-eager and really only works with minified source due to context limiting issues. I have an idea how to solve that on the Palefill side, that'll be the next thing before final integration.

SeaHOH commented 2 years ago

There are some bugs:

// my polyfill or firefox
RegExp("foo", "s"); // --> /foo/s

// both OK
RegExp("foo.", "s"); // --> /foo./s


- `Symbol.replace`
```javascript
// temporary repo
"foo bar".replace(RegExp("foo."), "($<foo>)");                    // --> TypeError: groups is undefined

// temporary repo and my polyfill (early version)
"foo bar".replace(RegExp("foo.", "s"), "($<foo>)");               // --> "()bar"
"foo bar".replace(RegExp("(?<foo>foo).", "s"), ["($<foo>)", ""]); // --> "($<foo>),"

// my polyfill (current version) or firefox
"foo bar".replace(RegExp("foo."), "($<foo>)");                    // --> "($<foo>)bar"
"foo bar".replace(RegExp("foo.", "s"), "($<foo>)");               // --> "($<foo>)bar"
"foo bar".replace(RegExp("(?<foo>foo).", "s"), ["($<foo>)", ""]); // --> "(foo),bar"

// both OK
"foo bar".replace(RegExp("(?<foo>foo).", "s"), "($<foo>)");       // --> "(foo)bar"
```
SeaHOH commented 2 years ago

And now I am aware that using Object.defineProperty in the constructor is better than defining getter functions in the NamedRegExp class.

martok commented 1 year ago

Resolved by implementing full support in PM 32.0 / UXP 6.0