[V1] Make prediction work in more languages

Not sure if this is useful

import Foundation

/// One node of the character trie built by `PPM`.
class Node {
    /// Child nodes keyed by the next character of a trained word.
    var children: [Character: Node] = [:]
    /// Number of trained words whose spelling passes through or ends at this node.
    var frequency: Int = 0
    /// Occurrence counts of the words that end exactly at this node.
    var words: [String: Int] = [:]
}

/// Character-trie frequency model that suggests next letters and words for a typed prefix.
class PPM {
    /// Root of the trie; it represents the empty prefix.
    let root = Node()

    /// Trains the model on the UTF-8 text file at `fileURL`.
    ///
    /// A read failure is reported on standard output and leaves the model unchanged.
    ///
    /// - Parameter fileURL: Location of the training text.
    func train(fromFile fileURL: URL) {
        do {
            let text = try String(contentsOf: fileURL, encoding: .utf8)
            train(text: text)
        } catch {
            print("Error reading file: \(error)")
        }
    }

    /// Trains the model on in-memory text.
    ///
    /// - Parameter text: Training text; words are separated by any run of whitespace.
    func train(text: String) {
        // Split on every kind of whitespace, not just " ", so that words on either
        // side of a newline or tab are not fused into a single entry.
        for word in text.split(whereSeparator: { $0.isWhitespace }) {
            var currentNode = root
            currentNode.frequency += 1
            for char in word {
                let nextNode: Node
                if let existing = currentNode.children[char] {
                    nextNode = existing
                } else {
                    nextNode = Node()
                    currentNode.children[char] = nextNode
                }
                // Every node on the word's path is counted once per occurrence.
                nextNode.frequency += 1
                currentNode = nextNode
            }
            currentNode.words[String(word), default: 0] += 1
        }
    }

    /// Predicts what follows `context`, treated as the beginning of a word.
    ///
    /// - Parameters:
    ///   - context: The characters typed so far.
    ///   - topN: Maximum number of entries per result list; values below 1 yield empty lists.
    /// - Returns: `letters` holds the most frequent next characters, and `words` holds the
    ///   trained words spelled exactly like `context`. Each entry is paired with its share
    ///   of the trained words that start with `context`. Both lists are empty when no
    ///   trained word starts with `context`.
    func predict(context: String, topN: Int) -> (letters: [(Character, Double)], words: [(String, Double)]) {
        var currentNode = root
        // Walk the context front to back: the trie is built in spelling order,
        // so a reversed walk would never reach the node for this prefix.
        for char in context {
            guard let nextNode = currentNode.children[char] else {
                return ([], [])
            }
            currentNode = nextNode
        }

        let topLetters = mostFrequentChildren(of: currentNode, topN: topN)
        let topWords = mostFrequentWords(of: currentNode, topN: topN)

        return (topLetters, topWords)
    }

    /// Returns up to `topN` children of `node`, most frequent first, each paired with
    /// its frequency divided by the frequency of `node`.
    private func mostFrequentChildren(of node: Node, topN: Int) -> [(Character, Double)] {
        // `prefix(_:)` traps on a negative length, and a zero total would divide by zero.
        guard topN > 0, node.frequency > 0 else { return [] }
        let totalFrequency = Double(node.frequency)

        return node.children
            .sorted { lhs, rhs in
                if lhs.value.frequency != rhs.value.frequency {
                    return lhs.value.frequency > rhs.value.frequency
                }
                // Dictionary order is unspecified; break ties by character for stable output.
                return lhs.key < rhs.key
            }
            .prefix(topN)
            .map { ($0.key, Double($0.value.frequency) / totalFrequency) }
    }

    /// Returns up to `topN` words ending at `node`, most frequent first, each paired with
    /// its count divided by the frequency of `node`.
    private func mostFrequentWords(of node: Node, topN: Int) -> [(String, Double)] {
        // `prefix(_:)` traps on a negative length, and a zero total would divide by zero.
        guard topN > 0, node.frequency > 0 else { return [] }
        let totalFrequency = Double(node.frequency)

        return node.words
            .sorted { lhs, rhs in
                if lhs.value != rhs.value {
                    return lhs.value > rhs.value
                }
                // Dictionary order is unspecified; break ties by word for stable output.
                return lhs.key < rhs.key
            }
            .prefix(topN)
            .map { ($0.key, Double($0.value) / totalFrequency) }
    }
}

// Usage
let ppm = PPM()

// Replace this path with the actual location of the training text.
// A file URL is required here: `URL(string:)` with a bare path produces a URL
// without the `file:` scheme, which cannot be read as a local file.
let fileURL = URL(fileURLWithPath: "path/to/your/text/file.txt")
ppm.train(fromFile: fileURL)

// Ask for up to six next-letter and word suggestions for the typed prefix.
let (topLetters, topWords) = ppm.predict(context: "hell", topN: 6)

print("Top letter predictions:")
for (char, likelihood) in topLetters {
    print("Next letter: \(char), Likelihood: \(likelihood)")
}

print("\nTop word predictions:")
for (word, likelihood) in topWords {
    print("Next word: \(word), Likelihood: \(likelihood)")
}

So really you'd want to update that training text continually too, but ideally you need a way for people to correct this training text. That was easy enough with Dasher, although undocumented: you literally edit the text file. Also note this for autocorrection, although I'm not sure how we would implement it:

extension PPM {
    /// Generates candidate spellings by swapping each pair of adjacent characters.
    ///
    /// - Parameter word: The word to transpose.
    /// - Returns: One candidate per adjacent pair of differing characters, in order of
    ///   position. Empty when `word` has fewer than two characters.
    func generateCandidates(word: String) -> [String] {
        var chars = Array(word)
        // `0..<(chars.count - 1)` traps for an empty word, so bail out early.
        guard chars.count > 1 else { return [] }

        var candidates: [String] = []
        // Swapping two equal characters would just reproduce `word`, so skip those pairs.
        for i in 0..<(chars.count - 1) where chars[i] != chars[i + 1] {
            chars.swapAt(i, i + 1)
            candidates.append(String(chars))
            chars.swapAt(i, i + 1)  // Swap back to original
        }

        return candidates
    }

    /// Suggests corrections for a misspelled word.
    ///
    /// Each candidate from `generateCandidates(word:)` is looked up with
    /// `predict(context:topN:)`; candidates for which it reports no word are dropped.
    ///
    /// - Parameters:
    ///   - word: The misspelled word.
    ///   - context: Currently unused; kept so existing call sites keep compiling.
    ///   - topN: Maximum number of suggestions; values below 1 yield an empty list.
    /// - Returns: Candidates with their likelihood, highest first.
    func autocorrect(word: String, context: String, topN: Int) -> [(String, Double)] {
        // `prefix(_:)` traps on a negative length.
        guard topN > 0 else { return [] }

        let scoredCandidates: [(String, Double)] = generateCandidates(word: word).compactMap { candidate in
            // `words` holds unlabeled (String, Double) tuples, so the score is element `.1`.
            guard let likelihood = predict(context: candidate, topN: 1).words.first?.1 else {
                return nil
            }
            return (candidate, likelihood)
        }

        let ranked = scoredCandidates.sorted { lhs, rhs in
            if lhs.1 != rhs.1 {
                return lhs.1 > rhs.1
            }
            // Break ties alphabetically so the order is deterministic.
            return lhs.0 < rhs.0
        }
        // `prefix(_:)` yields an ArraySlice; convert it to match the declared return type.
        return Array(ranked.prefix(topN))
    }
}

// Usage
// Named distinctly so that it does not redeclare the `ppm` constant of the earlier
// usage snippet when both snippets live in the same file.
let autocorrectModel = PPM()

// Train through `train(fromFile:)` by writing the sample text to a temporary file first.
let sampleURL = FileManager.default.temporaryDirectory
    .appendingPathComponent("ppm-autocorrect-sample.txt")
do {
    try "hello world hello everyone".write(to: sampleURL, atomically: true, encoding: .utf8)
    autocorrectModel.train(fromFile: sampleURL)
} catch {
    print("Error writing sample text: \(error)")
}

// Autocorrect the misspelled word "hlelo" given the context "hel".
// The sample word is a transposition of "hello": swapping adjacent characters is the
// only kind of typo `generateCandidates(word:)` can undo, so a word with a missing
// letter such as "helo" would produce no suggestions.
let corrections = autocorrectModel.autocorrect(word: "hlelo", context: "hel", topN: 3)

print("Autocorrection suggestions:")
for (word, likelihood) in corrections {
    print("Word: \(word), Likelihood: \(likelihood)")
}

Here, the generateCandidates(word:) method generates candidate words by swapping adjacent characters. This is a very simplistic way to generate candidates; in a real-world application, you might use more sophisticated techniques like Damerau-Levenshtein distance. The autocorrect(word:context:topN:) method takes a misspelled word and a context, generates candidates, and then uses the PPM model to rank these candidates by their likelihood. Note that, as written, the context argument is accepted but not yet used in the ranking.

AceCentre / Echo

[V1] Make prediction work in more languages #2