MuShare / EasyJapanese

Learn Japanese
MIT License
2 stars 1 forks source link

罗马字标音 #15

Open yanyin1986 opened 5 years ago

yanyin1986 commented 5 years ago
// Demo: print the Latin (romaji) transcription and the hiragana reading of every
// word in a Japanese sentence.
let text = "8年前、東京電力福島第一原発で事故がありました。事故のあと、福島県では、放射線を出す物質で汚れた土や草、木などを取る作業をしています。"
print(text)
let tokens = Tokenizer.tokenize(text: text)

// The locale does not depend on the token, so build it once instead of per iteration.
// NOTE(review): "japanese" is kept as in the original; "ja" is the conventional
// identifier — confirm which one is intended.
let locale = CFLocaleCreate(kCFAllocatorDefault,
                            CFLocaleIdentifier("japanese" as CFString))

for t in tokens {
    // CFString ranges are expressed in UTF-16 code units, not in Swift `Character`s.
    // `t.count` under-counts whenever a token contains surrogate pairs or
    // multi-scalar grapheme clusters, which would truncate the tokenized range.
    let wholeToken = CFRangeMake(0, t.utf16.count)

    // Skip the token instead of crashing if the tokenizer cannot be created.
    guard let tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault,
                                                  t as CFString,
                                                  wholeToken,
                                                  kCFStringTokenizerUnitWord,
                                                  locale) else {
        continue
    }

    // An empty token type means the tokenizer has run out of tokens.
    while !CFStringTokenizerAdvanceToNextToken(tokenizer).isEmpty {
        // Sub-tokens without a Latin transcription, or whose transcription cannot
        // be converted to hiragana, are skipped rather than force-unwrapped; the
        // remaining sub-tokens are still processed.
        guard
            let romaji = CFStringTokenizerCopyCurrentTokenAttribute(
                tokenizer,
                kCFStringTokenizerAttributeLatinTranscription) as? String,
            let hiragana = romaji.applyingTransform(.latinToHiragana, reverse: false)
        else {
            continue
        }
        print("\(t) => \(romaji) => \(hiragana)")
    }
}
/// Namespace for splitting text into word-level pieces.
struct Tokenizer {

    // MARK: - Publics

    /// Breaks `text` into words using Foundation's `.byWords` substring enumeration.
    ///
    /// - Parameter text: The string to split.
    /// - Returns: The enumerated word substrings, in the order they were reported.
    static func tokenize(text: String) -> [String] {
        var words = [String]()
        let entireRange = text.startIndex ..< text.endIndex
        text.enumerateSubstrings(in: entireRange, options: .byWords) { word, _, _, _ in
            // The enumerated substring is optional; ignore callbacks that carry none.
            guard let word = word else { return }
            words.append(word)
        }
        return words
    }
}