openvanilla / McBopomofo

小麥注音輸入法
http://mcbopomofo.openvanilla.org/
MIT License
615 stars 76 forks source link

Audit the encoding column (with values "big5", "cns") in BPMFBase.txt #497

Open lukhnos opened 2 months ago

lukhnos commented 2 months ago

See https://github.com/openvanilla/McBopomofo/issues/491#issuecomment-2167228005 for details. There are entries that are off. We may want to audit its use and decide whether to rectify those entries, or simply remove the column from the data and the scripts altogether.

tianjianjiang commented 2 months ago

See #491 (comment) for details. There are entries that are off. We may want to audit its use and decide whether to rectify those entries, or simply remove the column from the data and the scripts altogether.

Please remind me if the encoding column is for "Ctrl + ` Key: Input Big 5 Code" and if so, how to test it. 🙏

zonble commented 1 week ago

I guess a simple script can do that.

// Run `swift inspect.swift` from the command line.

import CoreFoundation
import Foundation

/// Returns the code of each character of `string` in the given encoding.
///
/// Every `Character` of `string` is converted on its own with
/// `CFStringGetCString`. A character whose conversion succeeds and yields at
/// least two bytes is rendered as `0x` followed by the first two bytes in
/// upper-case hexadecimal; any other character is rendered as `N/A`.
///
/// - Parameters:
///   - string: The text to inspect, one character at a time.
///   - encoding: A CoreFoundation string encoding value (`CFStringEncoding`).
/// - Returns: The per-character results joined by single spaces; an empty
///   string when `string` is empty.
func getCharCode(string: String, encoding: UInt32) -> String {
  // Room for up to three encoded bytes plus the terminating NUL.
  let bufferSize = 4
  return string.map { character -> String in
    var buffer = [CChar](repeating: 0, count: bufferSize)
    let converted = CFStringGetCString(String(character) as CFString, &buffer, bufferSize, encoding)
    // A failed conversion means the character is not representable (or does
    // not fit); do not try to interpret whatever is left in the buffer.
    guard converted else { return "N/A" }
    // Keep the bytes up to the first NUL, reinterpreting the signed C chars
    // as raw unsigned bytes.
    let bytes = buffer.prefix { $0 != 0 }.map { UInt8(bitPattern: $0) }
    // Single-byte results (e.g. ASCII) are not treated as a code.
    guard bytes.count >= 2 else { return "N/A" }
    return "0x" + String(format: "%02X%02X", bytes[0], bytes[1])
  }.joined(separator: " ")
}

// CoreFoundation string encoding values, declared locally as plain UInt32
// constants so they can be passed as the `encoding` argument of `getCharCode`.
let kCFStringEncodingBig5: UInt32 = 0x0A03
let kCFStringEncodingBig5_HKSCS_1999: UInt32 = 0x0A06
// NOTE(review): going by the name, this covers CNS 11643-92 plane 3 only — confirm
// that is the intended scope for the "cns" check.
let kCFStringEncodingCNS_11643_92_P3: UInt32 = 0x0653

/// Audits the encoding column of `../BPMFBase.txt` and prints every entry
/// whose declared category disagrees with what the encoders report.
///
/// Each line is split on single spaces. Lines that do not have exactly five
/// fields are echoed unchanged. Otherwise the first field is the word and the
/// fifth field is its category (`big5`, `cns` or `utf8`):
///
/// - `big5`: reported when the word has neither a Big5 nor a Big5-HKSCS code.
/// - `cns`: reported when the word has no CNS code, and also when it could
///   have been encoded as Big5 or Big5-HKSCS instead.
/// - `utf8`: reported when the word could have been encoded as Big5,
///   Big5-HKSCS or CNS instead.
///
/// - Throws: Any error raised while reading the file as UTF-8 text.
func main() throws {
  let path = "../BPMFBase.txt"
  let url = URL(fileURLWithPath: path)
  let text = try String(contentsOf: url, encoding: .utf8)

  for line in text.components(separatedBy: "\n") {
    let parts = line.components(separatedBy: " ")
    // Anything that is not a five-field record is shown as-is.
    guard parts.count == 5 else {
      print(line)
      continue
    }
    let word = parts[0]
    let category = parts[4]

    // NOTE(review): `getCharCode` joins per-character results with spaces, so
    // these comparisons against "N/A" assume `word` is a single character.
    let hasBig5 = getCharCode(string: word, encoding: kCFStringEncodingBig5) != "N/A"
    let hasBig5HKSCS = getCharCode(string: word, encoding: kCFStringEncodingBig5_HKSCS_1999) != "N/A"
    let hasCNS = getCharCode(string: word, encoding: kCFStringEncodingCNS_11643_92_P3) != "N/A"

    switch category {
    case "big5":
      if !hasBig5 && !hasBig5HKSCS {
        print("\(word) is not in big5 and big5 HKSCS")
      }
    case "cns":
      // The entry claims to be CNS, so flag it when NO CNS code exists.
      // (The original condition was inverted and flagged valid CNS entries.)
      if !hasCNS {
        print("\(word) is not CNS")
      }
      if hasBig5 {
        print("\(word) can be big5")
      } else if hasBig5HKSCS {
        print("\(word) can be big5 HKSCS")
      }
    case "utf8":
      if hasBig5 {
        print("\(word) can be big5")
      } else if hasBig5HKSCS {
        print("\(word) can be big5 HKSCS")
      } else if hasCNS {
        print("\(word) can be CNS")
      }
    default:
      // Other categories are not audited.
      break
    }
  }
}

// Run the audit. Report a failure (for example an unreadable data file) on
// stderr and exit non-zero instead of silently discarding the error.
do {
  try main()
} catch {
  FileHandle.standardError.write(Data("Error: \(error)\n".utf8))
  exit(1)
}