Closed sree018 closed 1 year ago
Looks good. Help with this would be really appreciated. If you can, could you create code page tables modelled on https://github.com/AbsaOSS/cobrix/blob/master/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage875.scala and name them CodePage838.scala, etc.?
It would save me a lot of time and will speed things up.
@yruslan
Please find attached the code pages for cp838, cp870 and cp1025, and let me know if they have any translation issues.
CodePage
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package za.co.absa.cobrix.cobol.parser.encoding.codepage
import za.co.absa.cobrix.cobol.internal.Logging
/**
  * Base class for EBCDIC to ASCII conversion tables of individual EBCDIC code pages.
  *
  * Concrete code pages supply a short name and a raw conversion table; callers obtain the
  * table through [[getEbcdicToAsciiMapping]], which validates its size.
  */
abstract class CodePage extends Serializable {

  /**
    * A short name that identifies the code page. It is used to tell code pages apart,
    * so it must be unique.
    */
  def codePageShortName: String

  /**
    * The raw conversion table. Every class inheriting from CodePage provides its own.
    */
  protected def ebcdicToAsciiMapping: Array[Char]

  /**
    * Returns the EBCDIC to ASCII conversion table provided by the concrete code page
    * after checking that it contains exactly 256 elements.
    *
    * The table is an array of characters indexed by the EBCDIC byte value; the element
    * stored at an index is the UNICODE symbol (as a `Char`) for that EBCDIC character.
    *
    * @return An EBCDIC to ASCII conversion table as an array of chars
    * @throws IllegalArgumentException if the table does not have exactly 256 elements
    */
  @throws(classOf[IllegalArgumentException])
  final def getEbcdicToAsciiMapping: Array[Char] = {
    val expectedSize = 256
    val mapping = ebcdicToAsciiMapping

    if (mapping.length == expectedSize) {
      mapping
    } else {
      throw new IllegalArgumentException(
        s"An EBCDIC to ASCII conversion table should have exactly $expectedSize elements. It has ${mapping.length} elements.")
    }
  }
}
/**
  * Factory methods for obtaining code pages either by a builtin name or by class name.
  */
object CodePage extends Logging {

  // Builtin code pages keyed by name. Values are factories, so every lookup
  // produces a new code page instance.
  private val builtInCodePages: Map[String, () => CodePage] = Map(
    "common"          -> (() => new CodePageCommon),
    "common_extended" -> (() => new CodePageCommonExt),
    "cp037"           -> (() => new CodePage037),
    "cp037_extended"  -> (() => new CodePage037Ext),
    "cp875"           -> (() => new CodePage875),
    "cp1047"          -> (() => new CodePage1047),
    "cp838"           -> (() => new CodePage838),
    "cp870"           -> (() => new CodePage870),
    "cp1025"          -> (() => new CodePage1025)
  )

  /**
    * Creates a builtin code page from its name.
    *
    * @param codePageName The name of a builtin code page (the match is exact and case-sensitive)
    * @return A new instance of the requested code page
    * @throws IllegalArgumentException if the name is not one of the builtin code pages
    */
  def getCodePageByName(codePageName: String): CodePage = {
    builtInCodePages.get(codePageName) match {
      case Some(createCodePage) =>
        createCodePage()
      case None =>
        throw new IllegalArgumentException(s"The code page '$codePageName' is not one of the builtin EBCDIC code pages.")
    }
  }

  /**
    * Creates a code page by loading the given class through the context class loader of
    * the current thread and instantiating it reflectively.
    *
    * @param codePageClass The name of the class to load; the instance is cast to [[CodePage]]
    * @return A new instance of the specified class
    */
  def getCodePageByClass(codePageClass: String): CodePage = {
    logger.info(s"Instantiating code page class: $codePageClass")

    val contextClassLoader = Thread.currentThread().getContextClassLoader
    val loadedClass = Class.forName(codePageClass, true, contextClassLoader)
    val instance = loadedClass.newInstance()

    instance.asInstanceOf[CodePage]
  }
}
codepage838
package za.co.absa.cobrix.cobol.parser.encoding.codepage
/**
  * EBCDIC code page 838, which adds Thai script support, as used in IBM mainframes.
  */
class CodePage838 extends CodePage {

  override def codePageShortName: String = "cp838"

  /**
    * Builds the conversion table for EBCDIC code page 838 (256 entries, indexed by EBCDIC byte).
    *
    * Table source: https://en.everybodywiki.com/EBCDIC_838
    */
  override protected def ebcdicToAsciiMapping: Array[Char] = {
    val cr = '\r'
    val lf = '\n'
    val sp = ' '
    val apos = '\''
    val quot = '\"'
    val backslash = '\\'

    // Thai vowel signs and tone marks are written as escapes and named after their Unicode names
    val maiHanAkat = '\u0E31'
    val saraI = '\u0E34'
    val saraIi = '\u0E35'
    val saraUe = '\u0E36'
    val saraUee = '\u0E37'
    val saraU = '\u0E38'
    val saraUu = '\u0E39'
    val phinthu = '\u0E3A'
    val maitaikhu = '\u0E47'
    val maiEk = '\u0E48'
    val maiTho = '\u0E49'
    val maiTri = '\u0E4A'
    val maiChattawa = '\u0E4B'
    val thanthakhat = '\u0E4C'
    val nikhahit = '\u0E4D'
    val yamakkan = '\u0E4E'

    // 0x00 - 0x3F: every code becomes a space except 0x0D -> '\n' and 0x25 -> '\r'.
    // NOTE(review): in EBCDIC 0x0D is CR and 0x25 is LF, so these two look swapped - confirm this is intended.
    val controlCodes = Array.tabulate[Char](64) {
      case 0x0D => lf
      case 0x25 => cr
      case _ => sp
    }

    val printableCodes = Array[Char](
      sp, sp, 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', '[', '¢', '.', '<', '(', '+', '|', // 0x40 - 0x4F
      '&', maiEk, 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', ']', '!', '$', '*', ')', ';', '¬', // 0x50 - 0x5F
      '-', '/', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', '^', '¦', ',', '%', '_', '>', '?', // 0x60 - 0x6F
      '฿', yamakkan, 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', '`', ':', '#', '@', apos, '=', quot, // 0x70 - 0x7F
      '๏', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', // 0x80 - 0x8F
      '๚', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ร', 'ฤ', 'ล', 'ฦ', 'ว', 'ศ', // 0x90 - 0x9F
      '๛', '~', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', // 0xA0 - 0xAF
      '๐', '๑', '๒', '๓', '๔', '๕', '๖', '๗', '๘', '๙', 'ฯ', 'ะ', maiHanAkat, 'า', 'ำ', saraI, // 0xB0 - 0xBF
      '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', maiTho, saraIi, saraUe, saraUee, saraU, saraUu, // 0xC0 - 0xCF
      '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', phinthu, 'เ', 'แ', 'โ', 'ใ', 'ไ', // 0xD0 - 0xDF
      backslash, maiTri, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'ๅ', 'ๆ', maitaikhu, maiEk, maiTho, maiTri, // 0xE0 - 0xEF
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', maiChattawa, thanthakhat, nikhahit, maiChattawa, thanthakhat, sp) // 0xF0 - 0xFF

    Array.concat(controlCodes, printableCodes)
  }
}
codePage 870
package za.co.absa.cobrix.cobol.parser.encoding.codepage
/**
  * EBCDIC code page with full Latin-2-charset used in IBM mainframes
  * in Albania, Bosnia and Herzegovina, Croatia, Czech Republic, Hungary, Poland, Romania, Slovakia, and Slovenia
  */
class CodePage870 extends CodePage {

  override def codePageShortName: String = "cp870"

  /**
    * Builds the conversion table for EBCDIC code page 870 (256 entries, indexed by EBCDIC byte).
    *
    * Codes 0 - 63 become a space, except 13 which becomes '\n' and 37 which becomes '\r'.
    *
    * Table source: https://en.everybodywiki.com/EBCDIC_870
    */
  override protected def ebcdicToAsciiMapping: Array[Char] = {
    val ebcdic2ascii: Array[Char] = {
      // NOTE(review): in EBCDIC 0x0D is CR and 0x25 is LF, so the two values below look swapped - confirm this is intended.
      val clf = '\r'
      val ccr = '\n'
      val spc = ' '
      val qts = '\''
      val qtd = '\"'
      val bsh = '\\'
      // DEGREE SIGN (U+00B0) for code 144 (0x90). Written as an escape so that it cannot be
      // confused with the look-alike RING ABOVE (U+02DA), which is not part of the Latin-2 repertoire.
      val deg = '\u00B0'
      Array[Char](
        spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, ccr, spc, spc, //   0 -  15
        spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, //  16 -  31
        spc, spc, spc, spc, spc, clf, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, //  32 -  47
        spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, //  48 -  63
        spc, spc, 'â', 'ä', 'ţ', 'á', 'ă', 'č', 'ç', 'ć', '[', '.', '<', '(', '+', '!', //  64 -  79
        '&', 'é', 'ę', 'ë', 'ů', 'í', 'î', 'ľ', 'ĺ', 'ß', ']', '$', '*', ')', ';', '^', //  80 -  95
        '-', '/', 'Â', 'Ä', '˝', 'Á', 'Ă', 'Č', 'Ç', 'Ć', '|', ',', '%', '_', '>', '?', //  96 - 111
        'ˇ', 'É', 'Ę', 'Ë', 'Ů', 'Í', 'Î', 'Ľ', 'Ĺ', '`', ':', '#', '@', qts, '=', qtd, // 112 - 127
        '˘', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'ś', 'ň', 'đ', 'ý', 'ř', 'ş', // 128 - 143
        deg, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ł', 'ń', 'š', '¸', '˛', '¤', // 144 - 159
        'ą', '~', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ś', 'Ň', 'Đ', 'Ý', 'Ř', 'Ş', // 160 - 175
        '˙', 'Ą', 'ż', 'Ţ', 'Ż', '§', 'ž', 'ź', 'Ž', 'Ź', 'Ł', 'Ń', 'Š', '¨', '´', '×', // 176 - 191
        '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', spc, 'ô', 'ö', 'ŕ', 'ó', 'ő', // 192 - 207
        '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'Ě', 'ű', 'ü', 'ť', 'ú', 'ě', // 208 - 223
        bsh, '÷', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'ď', 'Ô', 'Ö', 'Ŕ', 'Ó', 'Ő', // 224 - 239
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'Ď', 'Ű', 'Ü', 'Ť', 'Ú', spc) // 240 - 255
    }
    ebcdic2ascii
  }
}
codepage1025
package za.co.absa.cobrix.cobol.parser.encoding.codepage
/**
  * EBCDIC code page 1025, which provides the full Cyrillic charset.
  */
class CodePage1025 extends CodePage {

  override def codePageShortName: String = "cp1025"

  /**
    * Builds the conversion table for EBCDIC code page 1025 (256 entries, indexed by EBCDIC byte).
    *
    * Table source: https://en.everybodywiki.com/EBCDIC_1025
    */
  override protected def ebcdicToAsciiMapping: Array[Char] = {
    val cr = '\r'
    val lf = '\n'
    val sp = ' '
    val apos = '\''
    val quot = '\"'
    val backslash = '\\'

    // 0x00 - 0x3F: every code becomes a space except 0x0D -> '\n' and 0x25 -> '\r'.
    // NOTE(review): in EBCDIC 0x0D is CR and 0x25 is LF, so these two look swapped - confirm this is intended.
    val controlCodes = Array.tabulate[Char](64) {
      case 0x0D => lf
      case 0x25 => cr
      case _ => sp
    }

    val printableCodes = Array[Char](
      sp, sp, 'ђ', 'ѓ', 'ё', 'є', 'ѕ', 'і', 'ї', 'ј', '[', '.', '<', '(', '+', '!', // 0x40 - 0x4F
      '&', 'љ', 'њ', 'ћ', 'ќ', 'ў', 'џ', 'Ъ', '№', 'Ђ', ']', '$', '*', ')', ';', '^', // 0x50 - 0x5F
      '-', '/', 'Ѓ', 'Ё', 'Є', 'Ѕ', 'І', 'Ї', 'Ј', 'Љ', '|', ',', '%', '_', '>', '?', // 0x60 - 0x6F
      'Њ', 'Ћ', 'Ќ', sp, 'Ў', 'Џ', 'ю', 'а', 'б', '`', ':', '#', '@', apos, '=', quot, // 0x70 - 0x7F
      'ц', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'д', 'е', 'ф', 'г', 'х', 'и', // 0x80 - 0x8F
      'й', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'к', 'л', 'м', 'н', 'о', 'п', // 0x90 - 0x9F
      'я', '~', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'р', 'с', 'т', 'у', 'ж', 'в', // 0xA0 - 0xAF
      'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ', 'Ю', 'А', 'Б', 'Ц', 'Д', 'Е', 'Ф', 'Г', // 0xB0 - 0xBF
      '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'Х', 'И', 'Й', 'К', 'Л', 'М', // 0xC0 - 0xCF
      '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'Н', 'О', 'П', 'Я', 'Р', 'С', // 0xD0 - 0xDF
      backslash, '§', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Т', 'У', 'Ж', 'В', 'Ь', 'Ы', // 0xE0 - 0xEF
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'З', 'Ш', 'Э', 'Щ', 'Ч', sp) // 0xF0 - 0xFF

    Array.concat(controlCodes, printableCodes)
  }
}
Thanks a lot for the contribution. Will add to Cobrix shortly
Thanks a lot for the contribution! It is merged and will be part of the next release.
This should be available in 2.6.3, which was released yesterday.
Background
I am receiving some files encoded with the code pages listed below, and these code pages are not available in the framework.
Feature
https://en.everybodywiki.com/EBCDIC_838 https://en.everybodywiki.com/EBCDIC_870 https://en.everybodywiki.com/EBCDIC_1025
The above-mentioned code pages are SBCS (single-byte character set) code pages: https://www.ibm.com/docs/en/i/7.3?topic=information-national-language-keyboard-types-sbcs-code-pages
Could you please add these code pages in a future release?