twitter-archive / twitter-text-js

A JavaScript implementation of Twitter's text processing library
1.21k stars 166 forks source link

Consider generating the regular expressions as part of the build process instead of at runtime #95

Open mathiasbynens opened 11 years ago

mathiasbynens commented 11 years ago

E.g. this:

  // Collects the non-Latin code point ranges allowed in hashtags, one
  // addCharsToCharClass() call per range, then passes the joined pieces to
  // regexSupplant() and stores the result on twttr.txt.regexen.
  // NOTE(review): addCharsToCharClass and regexSupplant are defined outside this
  // snippet; presumably the former appends one range to the array — confirm.
  var nonLatinHashtagChars = [];
  // Cyrillic
  addCharsToCharClass(nonLatinHashtagChars, 0x0400, 0x04ff); // Cyrillic
  addCharsToCharClass(nonLatinHashtagChars, 0x0500, 0x0527); // Cyrillic Supplement
  addCharsToCharClass(nonLatinHashtagChars, 0x2de0, 0x2dff); // Cyrillic Extended A
  addCharsToCharClass(nonLatinHashtagChars, 0xa640, 0xa69f); // Cyrillic Extended B
  // Hebrew
  addCharsToCharClass(nonLatinHashtagChars, 0x0591, 0x05bf); // Hebrew
  addCharsToCharClass(nonLatinHashtagChars, 0x05c1, 0x05c2);
  addCharsToCharClass(nonLatinHashtagChars, 0x05c4, 0x05c5);
  addCharsToCharClass(nonLatinHashtagChars, 0x05c7, 0x05c7);
  addCharsToCharClass(nonLatinHashtagChars, 0x05d0, 0x05ea);
  addCharsToCharClass(nonLatinHashtagChars, 0x05f0, 0x05f4);
  addCharsToCharClass(nonLatinHashtagChars, 0xfb12, 0xfb28); // Hebrew Presentation Forms
  addCharsToCharClass(nonLatinHashtagChars, 0xfb2a, 0xfb36);
  addCharsToCharClass(nonLatinHashtagChars, 0xfb38, 0xfb3c);
  addCharsToCharClass(nonLatinHashtagChars, 0xfb3e, 0xfb3e);
  addCharsToCharClass(nonLatinHashtagChars, 0xfb40, 0xfb41);
  addCharsToCharClass(nonLatinHashtagChars, 0xfb43, 0xfb44);
  addCharsToCharClass(nonLatinHashtagChars, 0xfb46, 0xfb4f);
  // Arabic
  addCharsToCharClass(nonLatinHashtagChars, 0x0610, 0x061a); // Arabic
  addCharsToCharClass(nonLatinHashtagChars, 0x0620, 0x065f);
  addCharsToCharClass(nonLatinHashtagChars, 0x066e, 0x06d3);
  addCharsToCharClass(nonLatinHashtagChars, 0x06d5, 0x06dc);
  addCharsToCharClass(nonLatinHashtagChars, 0x06de, 0x06e8);
  addCharsToCharClass(nonLatinHashtagChars, 0x06ea, 0x06ef);
  addCharsToCharClass(nonLatinHashtagChars, 0x06fa, 0x06fc);
  addCharsToCharClass(nonLatinHashtagChars, 0x06ff, 0x06ff);
  addCharsToCharClass(nonLatinHashtagChars, 0x0750, 0x077f); // Arabic Supplement
  addCharsToCharClass(nonLatinHashtagChars, 0x08a0, 0x08a0); // Arabic Extended A
  addCharsToCharClass(nonLatinHashtagChars, 0x08a2, 0x08ac);
  addCharsToCharClass(nonLatinHashtagChars, 0x08e4, 0x08fe);
  addCharsToCharClass(nonLatinHashtagChars, 0xfb50, 0xfbb1); // Arabic Pres. Forms A
  addCharsToCharClass(nonLatinHashtagChars, 0xfbd3, 0xfd3d);
  addCharsToCharClass(nonLatinHashtagChars, 0xfd50, 0xfd8f);
  addCharsToCharClass(nonLatinHashtagChars, 0xfd92, 0xfdc7);
  addCharsToCharClass(nonLatinHashtagChars, 0xfdf0, 0xfdfb);
  addCharsToCharClass(nonLatinHashtagChars, 0xfe70, 0xfe74); // Arabic Pres. Forms B
  addCharsToCharClass(nonLatinHashtagChars, 0xfe76, 0xfefc);
  addCharsToCharClass(nonLatinHashtagChars, 0x200c, 0x200c); // Zero-Width Non-Joiner
  // Thai
  addCharsToCharClass(nonLatinHashtagChars, 0x0e01, 0x0e3a);
  addCharsToCharClass(nonLatinHashtagChars, 0x0e40, 0x0e4e);
  // Hangul (Korean)
  addCharsToCharClass(nonLatinHashtagChars, 0x1100, 0x11ff); // Hangul Jamo
  addCharsToCharClass(nonLatinHashtagChars, 0x3130, 0x3185); // Hangul Compatibility Jamo
  addCharsToCharClass(nonLatinHashtagChars, 0xA960, 0xA97F); // Hangul Jamo Extended-A
  addCharsToCharClass(nonLatinHashtagChars, 0xAC00, 0xD7AF); // Hangul Syllables
  addCharsToCharClass(nonLatinHashtagChars, 0xD7B0, 0xD7FF); // Hangul Jamo Extended-B
  addCharsToCharClass(nonLatinHashtagChars, 0xFFA1, 0xFFDC); // half-width Hangul
  // Japanese and Chinese
  addCharsToCharClass(nonLatinHashtagChars, 0x30A1, 0x30FA); // Katakana (full-width)
  addCharsToCharClass(nonLatinHashtagChars, 0x30FC, 0x30FE); // Katakana Chouon and iteration marks (full-width)
  addCharsToCharClass(nonLatinHashtagChars, 0xFF66, 0xFF9F); // Katakana (half-width)
  addCharsToCharClass(nonLatinHashtagChars, 0xFF70, 0xFF70); // Katakana Chouon (half-width)
  addCharsToCharClass(nonLatinHashtagChars, 0xFF10, 0xFF19); // \
  addCharsToCharClass(nonLatinHashtagChars, 0xFF21, 0xFF3A); //  - Latin (full-width)
  addCharsToCharClass(nonLatinHashtagChars, 0xFF41, 0xFF5A); // /
  addCharsToCharClass(nonLatinHashtagChars, 0x3041, 0x3096); // Hiragana
  addCharsToCharClass(nonLatinHashtagChars, 0x3099, 0x309E); // Hiragana voicing and iteration mark
  addCharsToCharClass(nonLatinHashtagChars, 0x3400, 0x4DBF); // Kanji (CJK Extension A)
  addCharsToCharClass(nonLatinHashtagChars, 0x4E00, 0x9FFF); // Kanji (Unified)
  // -- Disabled as it breaks the Regex.
  //addCharsToCharClass(nonLatinHashtagChars, 0x20000, 0x2A6DF); // Kanji (CJK Extension B)
  // NOTE(review): the next three ranges also lie above U+FFFF, like the disabled
  // Extension B range; confirm that addCharsToCharClass handles them correctly.
  addCharsToCharClass(nonLatinHashtagChars, 0x2A700, 0x2B73F); // Kanji (CJK Extension C)
  addCharsToCharClass(nonLatinHashtagChars, 0x2B740, 0x2B81F); // Kanji (CJK Extension D)
  addCharsToCharClass(nonLatinHashtagChars, 0x2F800, 0x2FA1F); // Kanji (CJK supplement)
  addCharsToCharClass(nonLatinHashtagChars, 0x3003, 0x3003); // Kanji iteration mark
  addCharsToCharClass(nonLatinHashtagChars, 0x3005, 0x3005); // Kanji iteration mark
  addCharsToCharClass(nonLatinHashtagChars, 0x303B, 0x303B); // Han iteration mark

  // Concatenate the collected pieces and compile them via regexSupplant().
  twttr.txt.regexen.nonLatinHashtagChars = regexSupplant(nonLatinHashtagChars.join(""));

With Regenerate, this could become:

// Non-Latin code points allowed in hashtags, expressed as a Regenerate set.
// Ranges use .addRange(start, end); single code points use .add(codePoint).
// Unlike the addCharsToCharClass() version, CJK Extension B (U+20000–U+2A6DF)
// is included here.
var nonLatinHashtagChars = regenerate()
  // Cyrillic
  .addRange(0x0400, 0x04FF) // Cyrillic
  .addRange(0x0500, 0x0527) // Cyrillic Supplement
  .addRange(0x2DE0, 0x2DFF) // Cyrillic Extended A
  .addRange(0xA640, 0xA69F) // Cyrillic Extended B
  // Hebrew
  .addRange(0x0591, 0x05BF) // Hebrew
  .addRange(0x05C1, 0x05C2)
  .addRange(0x05C4, 0x05C5)
  .add(0x05C7)
  .addRange(0x05D0, 0x05EA)
  .addRange(0x05F0, 0x05F4)
  .addRange(0xFB12, 0xFB28) // Hebrew Presentation Forms
  .addRange(0xFB2A, 0xFB36)
  .addRange(0xFB38, 0xFB3C)
  .add(0xFB3E)
  .addRange(0xFB40, 0xFB41)
  .addRange(0xFB43, 0xFB44)
  .addRange(0xFB46, 0xFB4F)
  // Arabic
  .addRange(0x0610, 0x061A) // Arabic
  .addRange(0x0620, 0x065F)
  .addRange(0x066E, 0x06D3)
  .addRange(0x06D5, 0x06DC)
  .addRange(0x06DE, 0x06E8)
  .addRange(0x06EA, 0x06EF)
  .addRange(0x06FA, 0x06FC)
  .add(0x06FF)
  .addRange(0x0750, 0x077F) // Arabic Supplement
  .add(0x08A0) // Arabic Extended A
  .addRange(0x08A2, 0x08AC)
  .addRange(0x08E4, 0x08FE)
  .addRange(0xFB50, 0xFBB1) // Arabic Pres. Forms A
  .addRange(0xFBD3, 0xFD3D)
  .addRange(0xFD50, 0xFD8F)
  .addRange(0xFD92, 0xFDC7)
  .addRange(0xFDF0, 0xFDFB)
  .addRange(0xFE70, 0xFE74) // Arabic Pres. Forms B
  .addRange(0xFE76, 0xFEFC)
  .add(0x200C) // Zero-Width Non-Joiner
  // Thai
  .addRange(0x0E01, 0x0E3A)
  .addRange(0x0E40, 0x0E4E)
  // Hangul (Korean)
  .addRange(0x1100, 0x11FF) // Hangul Jamo
  .addRange(0x3130, 0x3185) // Hangul Compatibility Jamo
  .addRange(0xA960, 0xA97F) // Hangul Jamo Extended-A
  .addRange(0xAC00, 0xD7AF) // Hangul Syllables
  .addRange(0xD7B0, 0xD7FF) // Hangul Jamo Extended-B
  .addRange(0xFFA1, 0xFFDC) // half-width Hangul
  // Japanese and Chinese
  .addRange(0x30A1, 0x30FA) // Katakana (full-width)
  .addRange(0x30FC, 0x30FE) // Katakana Chouon and iteration marks (full-width)
  .addRange(0xFF66, 0xFF9F) // Katakana (half-width)
  .add(0xFF70) // Katakana Chouon (half-width)
  .addRange(0xFF10, 0xFF19) // \
  .addRange(0xFF21, 0xFF3A) //  - Latin (full-width)
  .addRange(0xFF41, 0xFF5A) // /
  .addRange(0x3041, 0x3096) // Hiragana
  .addRange(0x3099, 0x309E) // Hiragana voicing and iteration mark
  .addRange(0x3400, 0x4DBF) // Kanji (CJK Extension A)
  .addRange(0x4E00, 0x9FFF) // Kanji (Unified)
  .addRange(0x20000, 0x2A6DF) // Kanji (CJK Extension B)
  .addRange(0x2A700, 0x2B73F) // Kanji (CJK Extension C)
  .addRange(0x2B740, 0x2B81F) // Kanji (CJK Extension D)
  .addRange(0x2F800, 0x2FA1F) // Kanji (CJK supplement)
  .add(0x3003) // Kanji iteration mark
  .add(0x3005) // Kanji iteration mark
  .add(0x303B); // Han iteration mark

// Compile the set into a RegExp at runtime.
twttr.txt.regexen.nonLatinHashtagChars = nonLatinHashtagChars.toRegExp();

It would be even better to do this as part of a build process rather than at runtime:

nonLatinHashtagChars.toString();
// Returns the set's regular-expression source as a string, which a build step can
// inject into a JS file as part of a regular expression literal, e.g.:
// '[\\u0400-\\u0527\\u0591-\\u05BF\\u05C1-\\u05C2\\u05C4-\\u05C5\\u05C7\\u05D0-\\u05EA\\u05F0-\\u05F4\\u0610-\\u061A\\u0620-\\u065F\\u066E-\\u06D3\\u06D5-\\u06DC\\u06DE-\\u06E8\\u06EA-\\u06EF\\u06FA-\\u06FC\\u06FF\\u0750-\\u077F\\u08A0\\u08A2-\\u08AC\\u08E4-\\u08FE\\u0E01-\\u0E3A\\u0E40-\\u0E4E\\u1100-\\u11FF\\u200C\\u2DE0-\\u2DFF\\u3003\\u3005\\u303B\\u3041-\\u3096\\u3099-\\u309E\\u30A1-\\u30FA\\u30FC-\\u30FE\\u3130-\\u3185\\u3400-\\u4DBF\\u4E00-\\u9FFF\\uA640-\\uA69F\\uA960-\\uA97F\\uAC00-\\uD7FF\\uFB12-\\uFB28\\uFB2A-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40-\\uFB41\\uFB43-\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFB\\uFE70-\\uFE74\\uFE76-\\uFEFC\\uFF10-\\uFF19\\uFF21-\\uFF3A\\uFF41-\\uFF5A\\uFF66-\\uFF9F\\uFFA1-\\uFFDC]|[\\uD840-\\uD868\\uD86A-\\uD86D][\\uDC00-\\uDFFF]|\\uD869[\\uDC00-\\uDEDF\\uDF00-\\uDFFF]|\\uD86E[\\uDC00-\\uDC1F]|\\uD87E[\\uDC00-\\uDE1F]'

This way, the source code (before building) is still very readable/maintainable, but the built code is optimized for run-time performance.

Note that using Regenerate would also solve this problem with astral symbols (code points above U+FFFF, which have to be matched as surrogate pairs):

  // -- Disabled as it breaks the Regex.
  //addCharsToCharClass(nonLatinHashtagChars, 0x20000, 0x2A6DF); // Kanji (CJK Extension B)

Would you be interested in a pull request that ports all the regular expressions to Regenerate + adds a simple build script?

jakl commented 11 years ago

Sounds great - better performance, low impact, and compatible between node and the browser.

mathiasbynens commented 11 years ago

@jakl What kind of patch would you prefer?

For example, I could create a new tools directory containing a small generate-regexes.js script that generates the regular expressions and writes the source of each one to a separate file in a new data directory.

The next step would be to tweak the build script so it automatically inserts the contents of those files in the right places in the source code. I generally use grunt-template for that, but this project is using a Rakefile so I can imagine you’d rather not introduce another “build script” layer.

kof commented 10 years ago

+1

mathiasbynens commented 10 years ago

Also, I would be interested to hear where these ranges come from. Is this just a listing of all code points in a given Unicode category/script/block/…? If so, that would make things even easier.