An auto-decode helper would check for a BOM and decode accordingly, then fall back to trying UTF-8 with some validation logic. Here's something similar I've been working on, but it would be cool to have this built in.
// TODO: split the work up (e.g. with process.nextTick) so that decoding doesn't block for as long
import iconv from 'iconv-lite';
// UTF-8 BOM     - 0xEF, 0xBB, 0xBF
// UTF-16 BE BOM - 0xFE, 0xFF
// UTF-16 LE BOM - 0xFF, 0xFE
/**
 * Decodes a buffer of unknown text encoding into a string.
 *
 * A byte-order mark, when present, decides the encoding (via `readWithBOM`);
 * otherwise the decision is delegated to `readWithoutBOM`.
 *
 * The function stays `async` so that existing callers which `await` it or
 * chain `.then()` keep working, even though nothing inside is awaited.
 *
 * @param {Buffer} buffer - Raw bytes to decode.
 * @returns {Promise<string|null>} The decoded text; `''` for an empty buffer
 *   or a buffer whose BOM-based decode is empty; `null` when
 *   `readWithoutBOM` yields no text.
 * @throws {Error} (as a rejection) when `buffer` is not a Buffer.
 */
export default async function readTextFromBuffer(buffer) {
  if (!Buffer.isBuffer(buffer)) {
    throw new Error('Input is not a buffer');
  }
  if (buffer.length === 0) {
    return '';
  }

  // Any string result means a BOM was recognised. An empty string is still a
  // valid decode (e.g. a file holding only a BOM) and must not be mistaken
  // for "no BOM", which would push the BOM bytes through the fallback path.
  const fromBOM = readWithBOM(buffer);
  if (typeof fromBOM === 'string') {
    return fromBOM;
  }

  const fallback = readWithoutBOM(buffer);
  return fallback || null;
}
/**
 * Reports whether `buffer` starts with the byte sequence `check`.
 *
 * @param {Buffer} buffer - Bytes to inspect.
 * @param {number[]} check - Expected leading byte values, in order.
 * @returns {boolean} `true` when every byte of `check` matches the byte at
 *   the same index of `buffer`; `false` when `buffer` is shorter than `check`
 *   or any byte differs.
 */
function hasBOM(buffer, check) {
  if (buffer.length < check.length) {
    return false;
  }
  // Strict comparison: only genuine numeric byte values may match.
  return check.every((expected, index) => buffer[index] === expected);
}
/**
 * Decodes `buffer` according to its byte-order mark, if it has one.
 *
 * Recognised signatures: UTF-8 (EF BB BF), UTF-16 big-endian (FE FF) and
 * UTF-16 little-endian (FF FE).
 *
 * @param {Buffer} buffer - Raw bytes to decode.
 * @returns {string|undefined} The decoded text, or `undefined` when the
 *   buffer does not start with a recognised BOM.
 */
function readWithBOM(buffer) {
  const UTF8_BOM = [0xEF, 0xBB, 0xBF];
  const UTF16_BE_BOM = [0xFE, 0xFF];
  const UTF16_LE_BOM = [0xFF, 0xFE];

  if (hasBOM(buffer, UTF8_BOM)) {
    // Buffer#toString does not drop the BOM; decoding the whole buffer would
    // leave a leading U+FEFF in the text. Start after the signature bytes so
    // this branch agrees with the iconv-lite branches, which strip the BOM by
    // default when decoding.
    return buffer.toString('utf8', UTF8_BOM.length);
  }
  if (hasBOM(buffer, UTF16_BE_BOM)) {
    return iconv.decode(buffer, 'utf16-be');
  }
  if (hasBOM(buffer, UTF16_LE_BOM)) {
    return iconv.decode(buffer, 'utf16-le');
  }
  return undefined;
}
/**
 * Decodes a buffer that carries no byte-order mark.
 *
 * UTF-8 is tried first. If that decode is empty or contains U+FFFD (the
 * replacement character), the buffer is decoded as windows-1252 instead and
 * that text is used, unless the windows-1252 text contains "\xEF\xBF\xBD".
 *
 * @param {Buffer} buffer - Raw bytes to decode.
 * @returns {string|null} The chosen decode, or `null` when it is empty.
 */
function readWithoutBOM(buffer) {
  // U+FFFD: appears in the UTF-8 decode wherever the bytes were not valid UTF-8.
  const replacementChar = String.fromCharCode(65533);

  let utf8Text = null;
  try {
    utf8Text = buffer.toString('utf8');
  } catch (err) {
    utf8Text = null;
  }

  const utf8LooksBroken = !utf8Text || utf8Text.includes(replacementChar);
  if (utf8LooksBroken) {
    // TODO: check the buffer for "unknown" directly before win1252 conversion
    const win1252Text = iconv.decode(buffer, 'win1252');

    // "\xEF\xBF\xBD" is how the bytes EF BF BD (the UTF-8 encoding of U+FFFD)
    // read as windows-1252. Finding it presumably means the source really
    // contained an encoded replacement character, so the UTF-8 decode is kept.
    const carriesEncodedReplacement = win1252Text.includes("\xEF\xBF\xBD");
    if (!carriesEncodedReplacement) {
      return win1252Text || null;
    }
  }

  return utf8Text || null;
}
It would be cool to have an "autoDecodeWithFallback" method.
It would check for a BOM and auto-decode, then try UTF-8 with some validation logic. Here's something similar I've been working on, but it would be cool to have this built in.