mathiasbynens / he

A robust HTML entity encoder/decoder written in JavaScript.
https://mths.be/he
MIT License
3.45k stars 254 forks source link

Handle lone surrogates as per the spec + implement lookup table #4

Closed mathiasbynens closed 11 years ago

mathiasbynens commented 11 years ago

From http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references:

Otherwise, if the number is in the range 0xD800 to 0xDFFF or is greater than 0x10FFFF, then this is a parse error. Return a U+FFFD REPLACEMENT CHARACTER.

Some examples:

he.decode('&#xD834;&#xDF06;') → '\uFFFD\uFFFD'
he.decode('&#xD834;') → '\uFFFD'

Also check out the table in the spec, e.g.:

he.decode('&#x00;') → '\uFFFD'

Reported by the amazing @zcorpan in #whatwg.

mathiasbynens commented 11 years ago

To get the table data as an object, open http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references and run the following script from the console:

// Console helper: read every table cell on the page and hand the resulting
// number → replacement mapping to copy() as an object-literal string.
var elements = document.querySelectorAll('td');
var keys = [];
var values = [];
// Cells are consumed in groups of three: the first cell of each group holds
// the number, the second holds the replacement, and the third is ignored.
[].forEach.call(elements, function(cell, position) {
    switch (position % 3) {
        case 0:
            keys.push(Number(cell.innerText.trim()));
            break;
        case 1:
            // Turn the first `U+` into a literal backslash-u escape prefix.
            values.push(cell.innerText.trim().replace('U+', '\\u'));
            break;
    }
});
// Pair each number with the replacement found at the same position.
var object = keys.reduce(function(table, key, index) {
    table[key] = values[index];
    return table;
}, {});
// copy() is not defined here; presumably the browser console's clipboard
// utility, since the script is meant to be run from the console.
copy(
    JSON.stringify(object, null, '\t')
        // JSON.stringify doubles every backslash; collapse each pair back to one.
        .split('\\\\').join('\\')
        // Shorten `\u00XX` escapes to the `\xXX` form.
        .split('\\u00').join('\\x')
        // Use single quotes instead of double quotes.
        .split('"').join('\'')
);

Result:

{
    '0': '\uFFFD',
    '13': '\x0D',
    '128': '\u20AC',
    '129': '\x81',
    '130': '\u201A',
    '131': '\u0192',
    '132': '\u201E',
    '133': '\u2026',
    '134': '\u2020',
    '135': '\u2021',
    '136': '\u02C6',
    '137': '\u2030',
    '138': '\u0160',
    '139': '\u2039',
    '140': '\u0152',
    '141': '\x8D',
    '142': '\u017D',
    '143': '\x8F',
    '144': '\x90',
    '145': '\u2018',
    '146': '\u2019',
    '147': '\u201C',
    '148': '\u201D',
    '149': '\u2022',
    '150': '\u2013',
    '151': '\u2014',
    '152': '\u02DC',
    '153': '\u2122',
    '154': '\u0161',
    '155': '\u203A',
    '156': '\u0153',
    '157': '\x9D',
    '158': '\u017E',
    '159': '\u0178'
}