EdwardZZZ / articles

工作点滴记录
2 stars 0 forks source link

bytesToString #50

Open EdwardZZZ opened 5 years ago

EdwardZZZ commented 5 years ago
// Sample text mixing CJK ideographs, ASCII letters, kana, digits and an emoji.
const str = '大类Zzのab123😁';
// The sample encoded as UTF-8 bytes (the encoding is spelled out rather than implied).
const bytes = Buffer.from(str, 'utf8');

/**
 * Decodes a sequence of UTF-8 bytes into a JavaScript string.
 *
 * Malformed input (stray continuation bytes, invalid lead bytes, truncated
 * or overlong sequences, encoded surrogates, values above U+10FFFF) is
 * replaced with U+FFFD instead of throwing or producing bogus characters.
 *
 * @param {ArrayLike<number>} bytes - Byte values (0-255), e.g. a Buffer,
 *     a Uint8Array or a plain array.
 * @returns {string} The decoded text.
 */
function bytesToString(bytes) {
    const REPLACEMENT = '\ufffd';
    const parts = [];
    let i = 0;

    while (i < bytes.length) {
        const lead = bytes[i];

        // 0xxxxxxx: a single-byte (ASCII) character.
        if (lead < 0x80) {
            parts.push(String.fromCharCode(lead));
            i++;
            continue;
        }

        let extra; // number of continuation bytes that must follow
        let code;  // payload bits taken from the lead byte
        let min;   // smallest code point allowed for this length (rejects overlong forms)
        if ((lead & 0xe0) === 0xc0) {
            extra = 1;
            code = lead & 0x1f;
            min = 0x80;
        } else if ((lead & 0xf0) === 0xe0) {
            extra = 2;
            code = lead & 0x0f;
            min = 0x800;
        } else if ((lead & 0xf8) === 0xf0) {
            extra = 3;
            code = lead & 0x07;
            min = 0x10000;
        } else {
            // A continuation byte in lead position, or a lead byte 0xF8-0xFF.
            parts.push(REPLACEMENT);
            i++;
            continue;
        }

        let next = i + 1;
        let complete = true;
        for (let k = 0; k < extra; k++, next++) {
            // Every continuation byte must exist and look like 10xxxxxx.
            if (next >= bytes.length || (bytes[next] & 0xc0) !== 0x80) {
                complete = false;
                break;
            }
            code = (code << 6) | (bytes[next] & 0x3f);
        }

        const valid = complete
            && code >= min
            && code <= 0x10ffff
            && (code < 0xd800 || code > 0xdfff);
        parts.push(valid ? String.fromCodePoint(code) : REPLACEMENT);

        // On failure `next` points at the offending byte, so decoding resumes there.
        i = next;
    }

    return parts.join('');
}

/**
 * Encodes a string as UTF-8.
 *
 * @param {string} str - Text to encode.
 * @returns {number[]} The UTF-8 byte values (0-255) in order.
 */
function stringToBytes(str) {
    const bytes = [];

    // for...of walks the string by code point, so a surrogate pair arrives as one item.
    for (const ch of str) {
        let c = ch.codePointAt(0);

        // A lone surrogate has no UTF-8 form; encode U+FFFD in its place,
        // which is what TextEncoder does as well.
        if (c >= 0xd800 && c <= 0xdfff) {
            c = 0xfffd;
        }

        if (c <= 0x7f) {
            // 1 byte: 0xxxxxxx
            bytes.push(c);
        } else if (c <= 0x7ff) {
            // 2 bytes: 110xxxxx 10xxxxxx (upper bound is inclusive)
            bytes.push(c >> 6 | 0xc0);
            bytes.push(c & 0x3f | 0x80);
        } else if (c <= 0xffff) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (upper bound is inclusive)
            bytes.push(c >> 12 | 0xe0);
            bytes.push(c >> 6 & 0x3f | 0x80);
            bytes.push(c & 0x3f | 0x80);
        } else {
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            bytes.push(c >> 18 | 0xf0);
            bytes.push(c >> 12 & 0x3f | 0x80);
            bytes.push(c >> 6 & 0x3f | 0x80);
            bytes.push(c & 0x3f | 0x80);
        }
    }

    return bytes;
}

// Demo: print the decoded sample, then whether our encoder matches Buffer's output.
const decoded = bytesToString(bytes);
console.log(decoded);
const ownBytes = stringToBytes(str).join(',');
const nodeBytes = Buffer.from(str).join(',');
console.log(ownBytes === nodeBytes);
EdwardZZZ commented 5 years ago

Node.js 中的 Buffer 基于 Uint8Array（无符号 8 位整数），因此不需要将大于 127 的字节换算为负数。

EdwardZZZ commented 5 years ago
// Sample input: a single emoji (U+1F601), which occupies two UTF-16 code units.
const str = '😁';
/**
 * Returns the Unicode code point of the first character of `str`,
 * combining a leading UTF-16 surrogate pair by hand.
 *
 * @param {string} str - Text whose first character is inspected.
 * @returns {number} The code point. When the string does not start with a
 *     valid surrogate pair, the first UTF-16 code unit is returned as is
 *     (NaN for an empty string).
 */
function UTF16(str) {
    const high = str.charCodeAt(0);
    const low = str.charCodeAt(1); // NaN when there is no second unit

    const isPair = high >= 0xd800 && high <= 0xdbff
        && low >= 0xdc00 && low <= 0xdfff;
    if (!isPair) return high;

    // Each surrogate carries 10 payload bits; the pair encodes (code point - 0x10000).
    return (((high & 0x3ff) << 10) | (low & 0x3ff)) + 0x10000;
}

// Rebuilds the character from the computed code point; the value is neither stored nor printed here.
String.fromCodePoint(UTF16(str));
EdwardZZZ commented 4 years ago
/**
 * Splits a code point into UTF-16 code units (despite the name, no UTF-8
 * bytes are produced here).
 *
 * @param {number} code - Unicode code point.
 * @returns {number|number[]} `code` itself when it is below 0x10000,
 *     otherwise a two-element array `[highSurrogate, lowSurrogate]`.
 */
function UTF8(code) {
    if (code < 0x10000) return code;

    // Surrogates encode the 20-bit offset above the Basic Multilingual Plane:
    // the top 10 bits go into the high unit, the bottom 10 bits into the low one.
    const offset = code - 0x10000;
    return [0xd800 | (offset >> 10), 0xdc00 | (offset & 0x3ff)];
}

// Round trip: string -> code point -> UTF-16 code units -> printed string.
const codePoint = UTF16(str);
const codes = UTF8(codePoint);
console.log(String.fromCodePoint(...codes));
EdwardZZZ commented 3 years ago

泰语字符串长度

/**
 * Counts the characters of a string, leaving out a fixed set of Thai
 * code points: U+0E31, U+0E33-U+0E3A and U+0E47-U+0E4E.
 *
 * Every other code point (including astral characters such as emoji)
 * counts as one.
 *
 * @param {string} str - Text to measure.
 * @returns {number} Number of counted code points.
 */
function strThaiLen(str) {
    let len = 0;

    // for...of iterates by code point, so surrogate pairs are counted once.
    for (const ch of str) {
        const cp = ch.codePointAt(0);

        // Same ranges the UTF-8 byte patterns E0 B8 B1, E0 B8 B3-BA and
        // E0 B9 87-8E describe, checked on the code point directly.
        const isUncounted = cp === 0x0e31
            || (cp >= 0x0e33 && cp <= 0x0e3a)
            || (cp >= 0x0e47 && cp <= 0x0e4e);

        if (!isUncounted) {
            len++;
        }
    }

    return len;
}

https://unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table https://github.com/orling/grapheme-splitter