AbsaOSS / cobrix

A COBOL parser and Mainframe/EBCDIC data source for Apache Spark
Apache License 2.0
136 stars 78 forks source link

new code pages cp838,cp870,cp1025 #567

Closed sree018 closed 1 year ago

sree018 commented 1 year ago

Background

I am receiving some files encoded with the code pages below, and these code pages are not available in the framework.

Feature

https://en.everybodywiki.com/EBCDIC_838 https://en.everybodywiki.com/EBCDIC_870 https://en.everybodywiki.com/EBCDIC_1025

The above-mentioned code pages are single-byte character sets (SBCS): https://www.ibm.com/docs/en/i/7.3?topic=information-national-language-keyboard-types-sbcs-code-pages

Can you please add these code pages in a future release?

yruslan commented 1 year ago

Looks good. Help with this would be really appreciated. If you can, could you create code page tables alongside https://github.com/AbsaOSS/cobrix/blob/master/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage875.scala, named CodePage838.scala, etc.?

It would save me a lot of time and will speed things up.

sree018 commented 1 year ago

@yruslan

Please find attached the code pages for cp838, cp870 and cp1025, and let me know if they have any translation issues.

CodePage

/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.cobrix.cobol.parser.encoding.codepage

import za.co.absa.cobrix.cobol.internal.Logging

/**
  * Base class for EBCDIC to ASCII conversion tables, with one subclass per EBCDIC code page.
  */
abstract class CodePage extends Serializable {
  /**
    * The short name that identifies this code page. It is used to tell code pages apart, so it must be unique.
    */
  def codePageShortName: String

  /**
    * The raw conversion table. Every concrete code page supplies its own.
    */
  protected def ebcdicToAsciiMapping: Array[Char]

  /**
    * Returns the EBCDIC to ASCII conversion table of this code page after validating its size.
    *
    * The table is obtained from the protected abstract method. It is an array of characters indexed by
    * the EBCDIC character code; the element at each index is the UNICODE symbol, represented as `Char`,
    * for that code. The table must contain exactly 256 elements.
    *
    * @return the conversion table as an array of chars
    * @throws IllegalArgumentException if the table does not contain exactly 256 elements
    */
  @throws(classOf[IllegalArgumentException])
  final def getEbcdicToAsciiMapping: Array[Char] = {
    val expectedSize = 256
    val mapping = ebcdicToAsciiMapping

    mapping.length match {
      case `expectedSize` => mapping
      case actualSize =>
        throw new IllegalArgumentException(
          s"An EBCDIC to ASCII conversion table should have exactly $expectedSize elements. It has $actualSize elements.")
    }
  }
}

/**
  * Factory methods for obtaining [[CodePage]] instances, either by a built-in short name or by class name.
  */
object CodePage extends Logging {

  /**
    * Creates one of the built-in EBCDIC code pages from its short name.
    *
    * @param codePageName the short name of a built-in code page, for example "cp037"
    * @return a new instance of the matching code page
    * @throws IllegalArgumentException if the name is not one of the built-in code pages
    */
  def getCodePageByName(codePageName: String): CodePage = {
    codePageName match {
      case "common"          => new CodePageCommon
      case "common_extended" => new CodePageCommonExt
      case "cp037"           => new CodePage037
      case "cp037_extended"  => new CodePage037Ext
      case "cp875"           => new CodePage875
      case "cp1047"          => new CodePage1047
      case "cp838"           => new CodePage838
      case "cp870"           => new CodePage870
      case "cp1025"          => new CodePage1025
      case codePage => throw new IllegalArgumentException(s"The code page '$codePage' is not one of the builtin EBCDIC code pages.")
    }
  }

  /**
    * Instantiates a code page from a class name.
    *
    * The class is loaded and initialized through the context class loader of the current thread, created
    * with its no-argument constructor, and cast to [[CodePage]].
    *
    * @param codePageClass the name of the class to instantiate, as accepted by `Class.forName`
    * @return a new instance of the requested class
    * @throws ClassNotFoundException if the class cannot be located
    * @throws NoSuchMethodException  if the class has no no-argument constructor
    * @throws ClassCastException     if the class is not a [[CodePage]]
    */
  def getCodePageByClass(codePageClass: String): CodePage = {
    logger.info(s"Instantiating code page class: $codePageClass")
    val codePageType = Class.forName(codePageClass,
                                     true,
                                     Thread.currentThread().getContextClassLoader)
    try {
      // `Class.newInstance()` is deprecated since Java 9; the documented replacement goes through the constructor.
      codePageType.getDeclaredConstructor().newInstance().asInstanceOf[CodePage]
    } catch {
      // `Constructor.newInstance` wraps exceptions thrown by the constructor. Rethrow the original one,
      // which is what `Class.newInstance()` propagated, so callers observe the same exception as before.
      case e: java.lang.reflect.InvocationTargetException if e.getCause != null => throw e.getCause
    }
  }

}

CodePage838

package za.co.absa.cobrix.cobol.parser.encoding.codepage

/**
  * EBCDIC code page 838, which adds support for Thai script on IBM mainframes.
  */
class CodePage838 extends CodePage {
  override def codePageShortName: String = "cp838"

  /**
    * Builds the EBCDIC code page 838 conversion table, following https://en.everybodywiki.com/EBCDIC_838
    *
    * @return a 256-element array indexed by the EBCDIC character code
    */
  override protected def ebcdicToAsciiMapping: Array[Char] = {
    val blank = ' '
    val apostrophe = '\''
    val doubleQuote = '"'
    val backslash = '\\'

    // Thai code points written as Unicode escapes, each named after its own code point.
    val u0E31 = '\u0E31'
    val u0E34 = '\u0E34'
    val u0E35 = '\u0E35'
    val u0E36 = '\u0E36'
    val u0E37 = '\u0E37'
    val u0E38 = '\u0E38'
    val u0E39 = '\u0E39'
    val u0E3A = '\u0E3A'
    val u0E47 = '\u0E47'
    val u0E48 = '\u0E48'
    val u0E49 = '\u0E49'
    val u0E4A = '\u0E4A'
    val u0E4B = '\u0E4B'
    val u0E4C = '\u0E4C'
    val u0E4D = '\u0E4D'
    val u0E4E = '\u0E4E'

    // Codes 0 - 63 all become a space, except 13 and 37.
    // NOTE(review): code 13 yields '\n' and code 37 yields '\r' - confirm this pairing is intended.
    val controlBlock = Array.tabulate[Char](64) {
      case 13 => '\n'
      case 37 => '\r'
      case _  => blank
    }

    val printableBlock = Array[Char](
      blank, blank, 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', '[', '¢', '.', '<', '(', '+', '|', // 0x40 - 0x4F
      '&', u0E48, 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', ']', '!', '$', '*', ')', ';', '¬', // 0x50 - 0x5F
      '-', '/', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', '^', '¦', ',', '%', '_', '>', '?', // 0x60 - 0x6F
      '฿', u0E4E, 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', '`', ':', '#', '@', apostrophe, '=', doubleQuote, // 0x70 - 0x7F
      '๏', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', // 0x80 - 0x8F
      '๚', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ร', 'ฤ', 'ล', 'ฦ', 'ว', 'ศ', // 0x90 - 0x9F
      '๛', '~', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', // 0xA0 - 0xAF
      '๐', '๑', '๒', '๓', '๔', '๕', '๖', '๗', '๘', '๙', 'ฯ', 'ะ', u0E31, 'า', 'ำ', u0E34, // 0xB0 - 0xBF
      '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', u0E49, u0E35, u0E36, u0E37, u0E38, u0E39, // 0xC0 - 0xCF
      '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', u0E3A, 'เ', 'แ', 'โ', 'ใ', 'ไ', // 0xD0 - 0xDF
      backslash, u0E4A, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'ๅ', 'ๆ', u0E47, u0E48, u0E49, u0E4A, // 0xE0 - 0xEF
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', u0E4B, u0E4C, u0E4D, u0E4B, u0E4C, blank) // 0xF0 - 0xFF

    controlBlock ++ printableBlock
  }

}

CodePage870

package za.co.absa.cobrix.cobol.parser.encoding.codepage

/**
  * EBCDIC code page 870 with the full Latin-2 charset, used in IBM mainframes
  * in Albania, Bosnia and Herzegovina, Croatia, Czech Republic, Hungary, Poland, Romania, Slovakia, and Slovenia
  */
class CodePage870 extends CodePage {
  override def codePageShortName: String = "cp870"

  /**
    * Builds the EBCDIC code page 870 conversion table, following https://en.everybodywiki.com/EBCDIC_870
    *
    * @return a 256-element array indexed by the EBCDIC character code
    */
  override protected def ebcdicToAsciiMapping: Array[Char] = {
    val spc = ' '
    val qts = '\''
    val qtd = '\"'
    val bsh = '\\'

    // Code 0x90 is the degree sign (U+00B0), which is part of Latin-2.
    // The look-alike ring above (U+02DA) is not part of Latin-2 and must not be used here.
    val deg = '\u00B0'

    // Codes 0 - 63 all become a space, except 13 and 37.
    // NOTE(review): code 13 yields '\n' and code 37 yields '\r' - confirm this pairing is intended.
    val controlBlock = Array.tabulate[Char](64) {
      case 13 => '\n'
      case 37 => '\r'
      case _  => spc
    }

    val printableBlock = Array[Char](
      spc, spc, 'â', 'ä', 'ţ', 'á', 'ă', 'č', 'ç', 'ć', '[', '.', '<', '(', '+', '!', // 0x40 - 0x4F
      '&', 'é', 'ę', 'ë', 'ů', 'í', 'î', 'ľ', 'ĺ', 'ß', ']', '$', '*', ')', ';', '^', // 0x50 - 0x5F
      '-', '/', 'Â', 'Ä', '˝', 'Á', 'Ă', 'Č', 'Ç', 'Ć', '|', ',', '%', '_', '>', '?', // 0x60 - 0x6F
      'ˇ', 'É', 'Ę', 'Ë', 'Ů', 'Í', 'Î', 'Ľ', 'Ĺ', '`', ':', '#', '@', qts, '=', qtd, // 0x70 - 0x7F
      '˘', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'ś', 'ň', 'đ', 'ý', 'ř', 'ş', // 0x80 - 0x8F
      deg, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ł', 'ń', 'š', '¸', '˛', '¤', // 0x90 - 0x9F
      'ą', '~', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ś', 'Ň', 'Đ', 'Ý', 'Ř', 'Ş', // 0xA0 - 0xAF
      '˙', 'Ą', 'ż', 'Ţ', 'Ż', '§', 'ž', 'ź', 'Ž', 'Ź', 'Ł', 'Ń', 'Š', '¨', '´', '×', // 0xB0 - 0xBF
      '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', spc, 'ô', 'ö', 'ŕ', 'ó', 'ő', // 0xC0 - 0xCF
      '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'Ě', 'ű', 'ü', 'ť', 'ú', 'ě', // 0xD0 - 0xDF
      bsh, '÷', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'ď', 'Ô', 'Ö', 'Ŕ', 'Ó', 'Ő', // 0xE0 - 0xEF
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'Ď', 'Ű', 'Ü', 'Ť', 'Ú', spc) // 0xF0 - 0xFF

    controlBlock ++ printableBlock
  }
}

CodePage1025

package za.co.absa.cobrix.cobol.parser.encoding.codepage

/**
  * EBCDIC code page 1025 with the full Cyrillic charset
  */
class CodePage1025 extends CodePage {
  override def codePageShortName: String = "cp1025"

  /**
    * Builds the EBCDIC code page 1025 conversion table, following https://en.everybodywiki.com/EBCDIC_1025
    *
    * @return a 256-element array indexed by the EBCDIC character code
    */
  override protected def ebcdicToAsciiMapping: Array[Char] = {
    val blank = ' '
    val apostrophe = '\''
    val doubleQuote = '"'
    val backslash = '\\'

    // Codes 0 - 63 all become a space, except 13 and 37.
    // NOTE(review): code 13 yields '\n' and code 37 yields '\r' - confirm this pairing is intended.
    val controlBlock = Array.tabulate[Char](64) {
      case 13 => '\n'
      case 37 => '\r'
      case _  => blank
    }

    val printableBlock = Array[Char](
      blank, blank, 'ђ', 'ѓ', 'ё', 'є', 'ѕ', 'і', 'ї', 'ј', '[', '.', '<', '(', '+', '!', // 0x40 - 0x4F
      '&', 'љ', 'њ', 'ћ', 'ќ', 'ў', 'џ', 'Ъ', '№', 'Ђ', ']', '$', '*', ')', ';', '^', // 0x50 - 0x5F
      '-', '/', 'Ѓ', 'Ё', 'Є', 'Ѕ', 'І', 'Ї', 'Ј', 'Љ', '|', ',', '%', '_', '>', '?', // 0x60 - 0x6F
      'Њ', 'Ћ', 'Ќ', blank, 'Ў', 'Џ', 'ю', 'а', 'б', '`', ':', '#', '@', apostrophe, '=', doubleQuote, // 0x70 - 0x7F
      'ц', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'д', 'е', 'ф', 'г', 'х', 'и', // 0x80 - 0x8F
      'й', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'к', 'л', 'м', 'н', 'о', 'п', // 0x90 - 0x9F
      'я', '~', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'р', 'с', 'т', 'у', 'ж', 'в', // 0xA0 - 0xAF
      'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ', 'Ю', 'А', 'Б', 'Ц', 'Д', 'Е', 'Ф', 'Г', // 0xB0 - 0xBF
      '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'Х', 'И', 'Й', 'К', 'Л', 'М', // 0xC0 - 0xCF
      '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'Н', 'О', 'П', 'Я', 'Р', 'С', // 0xD0 - 0xDF
      backslash, '§', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Т', 'У', 'Ж', 'В', 'Ь', 'Ы', // 0xE0 - 0xEF
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'З', 'Ш', 'Э', 'Щ', 'Ч', blank) // 0xF0 - 0xFF

    controlBlock ++ printableBlock
  }
}
yruslan commented 1 year ago

Thanks a lot for the contribution. Will add to Cobrix shortly

yruslan commented 1 year ago

Thanks a lot for the contribution! It is merged and will be part of the next release.

yruslan commented 1 year ago

This should be available in 2.6.3, released yesterday.