feature - autoDecodeWithFallback

Would be cool to have an "autoDecodeWithFallback" method...

iconv.autoDecode(buffer, [fallbackEncoding = 'win1252'])

It would check for a BOM and decode accordingly; if there is no BOM, it would try UTF-8 with some validity checks and otherwise fall back to the given encoding. Here's something similar I've been working on, but it would be cool to have this built in.

// TODO: add process.nextTick foo so that this doesn't block as long

import iconv from 'iconv-lite';

// UTF-8 BOM     - 0xEF, 0xBB, 0xBF
// UTF-16 BE BOM - 0xFE, 0xFF
// UTF-16 LE BOM - 0xFF, 0xFE

/**
 * Decode a buffer of unknown text encoding into a string.
 *
 * A BOM-based decode is attempted first (`readWithBOM`); when no BOM is
 * recognised, the BOM-less heuristic (`readWithoutBOM`) is used instead.
 *
 * The function stays `async` so existing callers that `await` it (or attach
 * `.then`/`.catch`) keep working; an invalid argument therefore surfaces as a
 * rejected promise rather than a synchronous throw.
 *
 * @param {Buffer} buffer - Raw bytes to decode.
 * @returns {Promise<string|null>} The decoded text, `''` for an empty buffer,
 *   or `null` when the BOM-less path yields nothing.
 * @throws {Error} (as a rejection) when `buffer` is not a `Buffer`.
 */
export default async function readTextFromBuffer(buffer) {
  if (!(buffer instanceof Buffer)) throw new Error('Input is not a buffer');
  if (!buffer.length) return '';

  // A BOM-only input can legitimately decode to ''. Test for "a string was
  // produced" rather than truthiness, so that case is not re-decoded by the
  // BOM-less heuristic (which would turn the BOM bytes into garbage text).
  const decodedFromBOM = readWithBOM(buffer);
  if (typeof decodedFromBOM === 'string') return decodedFromBOM;

  return readWithoutBOM(buffer) || null;
}

/**
 * Report whether `buffer` starts with the exact byte sequence `check`.
 *
 * @param {Buffer} buffer - Indexable byte sequence with a `length`.
 * @param {number[]} check - Expected leading byte values, in order.
 * @returns {boolean} `true` when every byte of `check` matches the byte at the
 *   same index in `buffer`; `false` when `buffer` is shorter than `check` or
 *   any byte differs.
 */
function hasBOM(buffer, check) {
  if (buffer.length < check.length) return false;
  // Strict comparison: byte values must be numbers, never coerced strings.
  return check.every((expectedByte, index) => buffer[index] === expectedByte);
}

/**
 * Decode `buffer` according to a leading byte-order mark, if one is present.
 *
 * Recognised BOMs: UTF-8 (EF BB BF), UTF-16 BE (FE FF) and UTF-16 LE (FF FE).
 *
 * @param {Buffer} buffer - Raw bytes to inspect and decode.
 * @returns {string|undefined} The decoded text, or `undefined` when the buffer
 *   does not start with a recognised BOM.
 */
function readWithBOM(buffer) {
  if (hasBOM(buffer, [0xEF, 0xBB, 0xBF])) {
    // utf8 - Buffer#toString does not drop the BOM (it would come back as a
    // leading U+FEFF), so start decoding after the three BOM bytes. This keeps
    // the result in line with the iconv-decoded branches below, where
    // iconv-lite removes the BOM by default.
    return buffer.toString('utf8', 3);
  }

  if (hasBOM(buffer, [0xFE, 0xFF])) {
    // utf16 be
    return iconv.decode(buffer, 'utf16-be');
  }

  if (hasBOM(buffer, [0xFF, 0xFE])) {
    // utf16 le
    return iconv.decode(buffer, 'utf16-le');
  }

  // No recognised BOM: let the caller fall back to heuristics.
  return undefined;
}

/**
 * Decode a BOM-less buffer, preferring UTF-8 and falling back to Windows-1252.
 *
 * The buffer is decoded as UTF-8 first. If that result is empty or contains
 * the replacement character U+FFFD, the bytes are decoded as Windows-1252
 * instead - unless the raw bytes themselves contain the UTF-8 encoding of
 * U+FFFD (EF BF BD), in which case the UTF-8 result is kept.
 *
 * @param {Buffer} buffer - Raw bytes without a recognised BOM.
 * @returns {string|null} The decoded text, or `null` when decoding produced an
 *   empty result.
 */
function readWithoutBOM(buffer) {
  const replacementChar = '\uFFFD';
  // UTF-8 byte sequence of U+FFFD. Decoded as win1252 these bytes would read
  // "\xEF\xBF\xBD", so looking for them in the raw buffer answers the same
  // question without decoding the whole buffer first.
  const replacementBytes = Buffer.from([0xEF, 0xBF, 0xBD]);

  let utf8Text;
  try {
    utf8Text = buffer.toString('utf8');
  } catch (err) {
    // Best effort: treat a failed UTF-8 decode as "no UTF-8 result" and let
    // the fallback below decide.
    utf8Text = null;
  }

  const utf8LooksBroken = !utf8Text || utf8Text.includes(replacementChar);

  // Only fall back when the replacement characters were produced by the UTF-8
  // decode itself, not when they were already stored in the input bytes.
  if (utf8LooksBroken && !buffer.includes(replacementBytes)) {
    return iconv.decode(buffer, 'win1252') || null;
  }

  return utf8Text || null;
}

ashtuchkin / iconv-lite

feature - autoDecodeWithFallback #127