ashtuchkin / iconv-lite

Convert character encodings in pure javascript.
MIT License
3.07k stars 282 forks source link

question: how can I contribute? #201

Open fabienjuif opened 6 years ago

fabienjuif commented 6 years ago

Hi @ashtuchkin

I would like to add cp1147 parsing to your library. How can I do that?

I know the code page layout from here: https://en.wikipedia.org/wiki/EBCDIC_297. I wrote this simple function to run on that Wikipedia page:

// Meant to be pasted into the browser console on the code page table:
// yields an object of the form { "<row hex><column hex>": cellText }.
Array.from(
  document.querySelectorAll('table > tbody > tr > td > span'),
  (cell, position) => {
    // Shift the running position by 2 from cell 48 on and by 1 more from cell 60 on
    // (presumably to skip table slots that have no matching <span> — confirm against the page)
    const shift = (position >= 48 ? 2 : 0) + (position >= 60 ? 1 : 0)
    const slot = position + shift

    // 16 cells per table row: the row is the high hex digit, the column the low one
    const row = Math.floor(slot / 16)
    const col = slot % 16
    return [`${row.toString(16)}${col.toString(16)}`, cell.innerText]
  }
).reduce((table, [key, text]) => {
  table[key] = text
  return table
}, {})

Which produces this (I have commented out the special characters for now, but I will map them manually afterwards):

// Lookup table from a byte value, written as two lowercase hex digits, to the
// character it stands for. Control-code positions are left commented out
// (FIXME) until they are mapped manually.
// NOTE(review): the layout was scraped from the EBCDIC 297 page; code page 1147
// reportedly differs at 0x9f (euro sign instead of "¤") — confirm before relying
// on this table for cp1147.
const matrix = {
  //FIXME: "10": "DLE",
  //FIXME: "11": "DC1",
  //FIXME: "12": "DC2",
  //FIXME: "13": "DC3",
  //FIXME: "14": "res/enp",
  //FIXME: "15": "NL",
  //FIXME: "16": "BS",
  //FIXME: "17": "POC",
  //FIXME: "18": "CAN",
  //FIXME: "19": "EM",
  //FIXME: "20": "DS",
  //FIXME: "21": "SOS",
  //FIXME: "22": "FS",
  //FIXME: "23": "WUS",
  //FIXME: "24": "byp/imp",
  //FIXME: "25": "LF",
  //FIXME: "26": "ETB",
  //FIXME: "27": "ESC",
  //FIXME: "28": "SA",
  //FIXME: "29": "SFE",
  //FIXME: "32": "SYN",
  //FIXME: "33": "IR",
  //FIXME: "34": "PP",
  //FIXME: "35": "TRN",
  //FIXME: "36": "NBS",
  //FIXME: "37": "EOT",
  //FIXME: "38": "SBS",
  //FIXME: "39": "IT",
  //FIXME: "40": "SP",
  //FIXME: "41": "RSP",
  "42": "â",
  "43": "ä",
  "44": "@",
  "45": "á",
  "46": "ã",
  "47": "å",
  "48": "\\",
  "49": "ñ",
  "50": "&",
  "51": "{",
  "52": "ê",
  "53": "ë",
  "54": "}",
  "55": "í",
  "56": "î",
  "57": "ï",
  "58": "ì",
  "59": "ß",
  "60": "-",
  "61": "/",
  "62": "Â",
  "63": "Ä",
  "64": "À",
  "65": "Á",
  "66": "Ã",
  "67": "Å",
  "68": "Ç",
  "69": "Ñ",
  "70": "ø",
  "71": "É",
  "72": "Ê",
  "73": "Ë",
  "74": "È",
  "75": "Í",
  "76": "Î",
  "77": "Ï",
  "78": "Ì",
  "79": "µ",
  "80": "Ø",
  "81": "a",
  "82": "b",
  "83": "c",
  "84": "d",
  "85": "e",
  "86": "f",
  "87": "g",
  "88": "h",
  "89": "i",
  "90": "[",
  "91": "j",
  "92": "k",
  "93": "l",
  "94": "m",
  "95": "n",
  "96": "o",
  "97": "p",
  "98": "q",
  "99": "r",
  //FIXME: "00": "NUL",
  //FIXME: "01": "SOH",
  //FIXME: "02": "STX",
  //FIXME: "03": "ETX",
  //FIXME: "04": "SEL",
  //FIXME: "05": "HT",
  //FIXME: "06": "RNL",
  //FIXME: "07": "DEL",
  //FIXME: "08": "GE",
  //FIXME: "09": "SPS",
  //FIXME: "0a": "RPT",
  //FIXME: "0b": "VT",
  //FIXME: "0c": "FF",
  //FIXME: "0d": "CR",
  //FIXME: "0e": "SO",
  //FIXME: "0f": "SI",
  //FIXME: "1a": "UBS",
  //FIXME: "1b": "CU1",
  //FIXME: "1c": "IFS",
  //FIXME: "1d": "IGS",
  //FIXME: "1e": "IRS",
  //FIXME: "1f": "ius/itb",
  //FIXME: "2a": "sm/sw",
  //FIXME: "2b": "CSP",
  //FIXME: "2c": "MFA",
  //FIXME: "2d": "ENQ",
  //FIXME: "2e": "ACK",
  //FIXME: "2f": "BEL",
  //FIXME: "3a": "RFF",
  //FIXME: "3b": "CU3",
  //FIXME: "3c": "DC4",
  //FIXME: "3d": "NAK",
  //FIXME: "3f": "SUB",
  "4a": "°",
  "4b": ".",
  "4c": "<",
  "4d": "(",
  "4e": "+",
  "4f": "!",
  "5a": "§",
  "5b": "$",
  "5c": "*",
  "5d": ")",
  "5e": ";",
  "5f": "^",
  "6a": "ù",
  "6b": ",",
  "6c": "%",
  "6d": "_",
  "6e": ">",
  "6f": "?",
  "7a": ":",
  "7b": "£",
  "7c": "à",
  "7d": "'",
  "7e": "=",
  "7f": "\"",
  "8a": "«",
  "8b": "»",
  "8c": "ð",
  "8d": "ý",
  "8e": "þ",
  "8f": "±",
  "9a": "ª",
  "9b": "º",
  "9c": "æ",
  "9d": "¸",
  "9e": "Æ",
  "9f": "¤",
  "a0": "`",
  "a1": "¨",
  "a2": "s",
  "a3": "t",
  "a4": "u",
  "a5": "v",
  "a6": "w",
  "a7": "x",
  "a8": "y",
  "a9": "z",
  "aa": "¡",
  "ab": "¿",
  "ac": "Ð",
  "ad": "Ý",
  "ae": "Þ",
  "af": "®",
  "b0": "¢",
  "b1": "#",
  "b2": "¥",
  "b3": "·",
  "b4": "©",
  "b5": "]",
  "b6": "¶",
  "b7": "¼",
  "b8": "½",
  "b9": "¾",
  "ba": "¬",
  "bb": "|",
  "bc": "¯",
  "bd": "~",
  "be": "´",
  "bf": "×",
  "c0": "é",
  "c1": "A",
  "c2": "B",
  "c3": "C",
  "c4": "D",
  "c5": "E",
  "c6": "F",
  "c7": "G",
  "c8": "H",
  "c9": "I",
  //FIXME: "ca": "SHY",
  "cb": "ô",
  "cc": "ö",
  "cd": "ò",
  "ce": "ó",
  "cf": "õ",
  "d0": "è",
  "d1": "J",
  "d2": "K",
  "d3": "L",
  "d4": "M",
  "d5": "N",
  "d6": "O",
  "d7": "P",
  "d8": "Q",
  "d9": "R",
  "da": "¹",
  "db": "û",
  "dc": "ü",
  "dd": "¦",
  "de": "ú",
  "df": "ÿ",
  "e0": "ç",
  "e1": "÷",
  "e2": "S",
  "e3": "T",
  "e4": "U",
  "e5": "V",
  "e6": "W",
  "e7": "X",
  "e8": "Y",
  "e9": "Z",
  "ea": "²",
  "eb": "Ô",
  "ec": "Ö",
  "ed": "Ò",
  "ee": "Ó",
  "ef": "Õ",
  "f0": "0",
  "f1": "1",
  "f2": "2",
  "f3": "3",
  "f4": "4",
  "f5": "5",
  "f6": "6",
  "f7": "7",
  "f8": "8",
  "f9": "9",
  "fa": "³",
  "fb": "Û",
  "fc": "Ü",
  "fd": "Ù",
  "fe": "Ú",
  //FIXME: "ff": "EO"

  // These entries are overridden by hand (their scraped values above are commented out)
  '40': ' ',
  '18': '', // no data, TODO: ask for it (CAN)
}

I saw in this PR that a chars array can be used: https://github.com/ashtuchkin/iconv-lite/pull/196/files#diff-6072bbae0aa1a9b4406cc0e3b969095fR22

Should I use this? How do I know what the first character mapping is?

Thank you 👍

PS: feel free to RTFM me! With links please ;)

fabienjuif commented 6 years ago

And this is the example I used to test the conversion matrix:

/**
 * Decodes the content of ./cp1147.txt one byte at a time through the
 * `matrix` lookup table.
 *
 * Assumes `readFile` resolves to a Buffer (or another iterable of byte
 * values) — TODO confirm against the import used by the real script.
 *
 * @returns {Promise<string[]>} one entry per input byte: the mapped character,
 *   or '?' when the table has no entry for that byte.
 */
const run = async () => {
  const raw = await readFile('./cp1147.txt')

  return Array.from(raw, (byte) => {
    // Table keys are always two hex digits ("0a", "4b", ...), so bytes below
    // 0x10 need a leading zero to be found.
    const key = byte.toString(16).padStart(2, '0')
    const unicodeChar = matrix[key]

    // Compare against undefined explicitly: '' is a valid mapping in the table
    return unicodeChar === undefined ? '?' : unicodeChar
  })
}

run()
  .then(codes => {
    console.log(codes.join(''))
  })
  .catch(error => {
    // Report read/decoding failures instead of leaving the rejection unhandled
    console.error(error)
    process.exitCode = 1
  })
fabienjuif commented 6 years ago

related to #111

yosion-p commented 3 years ago

Hi, Mechanism to add encodings from external npm packages #253 , I hope it helps.