AceCentre / Echo

Echo is an AAC app designed for those with a visual difficulty and physical difficulty to access communication.
https://docs.acecentre.org.uk/products/v/echo
GNU General Public License v3.0
0 stars 0 forks source link

[V1] Make prediction work in more languages #2

Closed willwade closed 11 months ago

willwade commented 11 months ago

Not sure if this is useful

import Foundation

/// One node of the character trie used by the prediction model.
class Node {
    /// Child nodes, keyed by the character that leads to them.
    var children = [Character: Node]()
    /// Counter that training bumps once for every word whose path touches this node.
    var frequency = 0
    /// Occurrence counts for words that training recorded as ending at this node.
    var words = [String: Int]()
}

/// A character-trie model for next-letter and word-completion prediction.
///
/// Every trained word is inserted into a trie rooted at `root`. Each node on a
/// word's path (root included) has its `frequency` incremented once per
/// occurrence, so a node's `frequency` is the number of trained word
/// occurrences that start with the prefix the node represents. The node where a
/// word ends also records that word in its `words` table.
class PPM {
    /// Root of the trie; represents the empty prefix.
    let root = Node()

    /// Trains the model on the contents of a UTF-8 encoded text file.
    ///
    /// Read errors are reported with `print` and otherwise ignored, leaving the
    /// model unchanged.
    ///
    /// - Parameter fileURL: Location of the training text.
    func train(fromFile fileURL: URL) {
        do {
            let text = try String(contentsOf: fileURL, encoding: .utf8)
            train(text: text)
        } catch {
            print("Error reading file: \(error)")
        }
    }

    /// Trains the model on in-memory text.
    ///
    /// Words are separated by any whitespace (spaces, tabs, newlines), so
    /// multi-line training text is tokenised correctly.
    ///
    /// - Parameter text: The training text.
    func train(text: String) {
        for word in text.split(whereSeparator: { $0.isWhitespace }) {
            var currentNode = root
            for char in word {
                currentNode.frequency += 1
                let child: Node
                if let existing = currentNode.children[char] {
                    child = existing
                } else {
                    child = Node()
                    currentNode.children[char] = child
                }
                currentNode = child
            }
            // The terminal node is counted too, and remembers the complete word.
            currentNode.frequency += 1
            currentNode.words[String(word), default: 0] += 1
        }
    }

    /// Predicts the most likely next letters and word completions for a typed prefix.
    ///
    /// - Parameters:
    ///   - context: The characters typed so far for the current word. An empty
    ///     string predicts from the whole vocabulary.
    ///   - topN: Maximum number of letters and of words to return. Values less
    ///     than one yield empty results.
    /// - Returns: `letters` holds candidate next characters and `words` holds
    ///   trained words that start with `context`. Each is paired with its
    ///   likelihood: its count divided by the number of trained word
    ///   occurrences sharing the prefix. Both are sorted by descending
    ///   likelihood. Both are empty when no trained word starts with `context`.
    func predict(context: String, topN: Int) -> (letters: [(Character, Double)], words: [(String, Double)]) {
        guard topN > 0 else { return ([], []) }

        // The trie is built front-to-back, so the context must be walked in
        // the same order it was typed.
        var currentNode = root
        for char in context {
            guard let nextNode = currentNode.children[char] else {
                return ([], [])
            }
            currentNode = nextNode
        }

        let topLetters = mostFrequentChildren(of: currentNode, topN: topN)
        let topWords = mostFrequentWords(of: currentNode, topN: topN)

        return (topLetters, topWords)
    }

    /// Returns up to `topN` child characters of `node`, most frequent first.
    ///
    /// Ties are broken by character order so results are deterministic.
    private func mostFrequentChildren(of node: Node, topN: Int) -> [(Character, Double)] {
        let totalFrequency = Double(node.frequency)
        guard topN > 0, totalFrequency > 0 else { return [] }

        let ranked = node.children.sorted { lhs, rhs in
            if lhs.value.frequency != rhs.value.frequency {
                return lhs.value.frequency > rhs.value.frequency
            }
            return lhs.key < rhs.key
        }

        return ranked.prefix(topN).map { entry in
            (entry.key, Double(entry.value.frequency) / totalFrequency)
        }
    }

    /// Returns up to `topN` words found at or below `node`, most frequent first.
    ///
    /// The whole subtree is searched: a node's own `words` table can only
    /// contain the prefix itself, so completions live in its descendants.
    /// Ties are broken alphabetically so results are deterministic.
    private func mostFrequentWords(of node: Node, topN: Int) -> [(String, Double)] {
        let totalFrequency = Double(node.frequency)
        guard topN > 0, totalFrequency > 0 else { return [] }

        // Iterative depth-first walk; avoids deep recursion on long words.
        var counts: [String: Int] = [:]
        var pending: [Node] = [node]
        while let current = pending.popLast() {
            for (word, count) in current.words {
                counts[word, default: 0] += count
            }
            pending.append(contentsOf: current.children.values)
        }

        let ranked = counts.sorted { lhs, rhs in
            if lhs.value != rhs.value {
                return lhs.value > rhs.value
            }
            return lhs.key < rhs.key
        }

        return ranked.prefix(topN).map { entry in
            (entry.key, Double(entry.value) / totalFrequency)
        }
    }
}

// Usage
let ppm = PPM()

// Replace this path with the actual location of your training text.
// A plain filesystem path must be wrapped with `URL(fileURLWithPath:)`;
// `URL(string:)` would produce a scheme-less URL that is not a file URL.
let fileURL = URL(fileURLWithPath: "path/to/your/text/file.txt")
ppm.train(fromFile: fileURL)

// Ask for up to six next-letter and word suggestions for the typed prefix.
let (topLetters, topWords) = ppm.predict(context: "hell", topN: 6)

print("Top letter predictions:")
for (char, likelihood) in topLetters {
    print("Next letter: \(char), Likelihood: \(likelihood)")
}

print("\nTop word predictions:")
for (word, likelihood) in topWords {
    print("Next word: \(word), Likelihood: \(likelihood)")
}

So really you'd want to update that training text continually too, but ideally you also need a way for people to correct this training text. That was easy enough with Dasher, although undocumented: you literally edit the text file. Also note this for autocorrection, though I'm not sure how we would implement it:

extension PPM {
    /// Generates candidate spellings of `word` by transposing adjacent characters.
    ///
    /// Swaps of two identical characters are skipped because they would just
    /// return `word` unchanged; this also guarantees the candidates are unique.
    ///
    /// - Parameter word: The (possibly misspelled) word.
    /// - Returns: One candidate per effective adjacent transposition, in
    ///   left-to-right order. Empty when `word` has fewer than two characters.
    func generateCandidates(word: String) -> [String] {
        var chars = Array(word)
        // Also protects the range below: `0..<(0 - 1)` would trap for an empty word.
        guard chars.count >= 2 else { return [] }

        var candidates: [String] = []
        for i in 0..<(chars.count - 1) where chars[i] != chars[i + 1] {
            chars.swapAt(i, i + 1)
            candidates.append(String(chars))
            chars.swapAt(i, i + 1)  // Swap back to original
        }

        return candidates
    }

    /// Suggests corrections for a misspelled word.
    ///
    /// Each candidate from `generateCandidates(word:)` is kept only if it was
    /// seen as a complete word during training. Its score is the number of
    /// times it was trained divided by the `frequency` of its trie node, i.e.
    /// the share of trained words starting with the candidate that are exactly
    /// the candidate.
    ///
    /// - Parameters:
    ///   - word: The misspelled word.
    ///   - context: Currently not used for scoring; kept so existing call sites
    ///     continue to compile.
    ///   - topN: Maximum number of suggestions. Values less than one yield an
    ///     empty result.
    /// - Returns: Up to `topN` `(candidate, score)` pairs, best score first;
    ///   ties are ordered alphabetically.
    func autocorrect(word: String, context: String, topN: Int) -> [(String, Double)] {
        guard topN > 0 else { return [] }

        var scoredCandidates: [(String, Double)] = []
        for candidate in generateCandidates(word: word) {
            guard let node = terminalNode(for: candidate),
                  let count = node.words[candidate],
                  node.frequency > 0 else {
                continue
            }
            scoredCandidates.append((candidate, Double(count) / Double(node.frequency)))
        }

        let ranked = scoredCandidates.sorted { lhs, rhs in
            if lhs.1 != rhs.1 {
                return lhs.1 > rhs.1
            }
            return lhs.0 < rhs.0
        }

        // `prefix` yields an ArraySlice; convert to match the declared return type.
        return Array(ranked.prefix(topN))
    }

    /// Walks the trie from `root` along `text`, front to back.
    ///
    /// - Returns: The node reached after consuming all of `text`, or `nil` if
    ///   the path does not exist.
    private func terminalNode(for text: String) -> Node? {
        var current = root
        for char in text {
            guard let next = current.children[char] else { return nil }
            current = next
        }
        return current
    }
}

// Usage: train on a tiny in-memory corpus, then ask for corrections.
// NOTE(review): this relies on `PPM` offering a `train(text:)` method — confirm it is available.
let ppm = PPM()
ppm.train(text: "hello world hello everyone")

// Request up to three corrections for the misspelling "helo", passing "hel" as context.
let corrections = ppm.autocorrect(word: "helo", context: "hel", topN: 3)

print("Autocorrection suggestions:")
corrections.forEach { suggestion in
    print("Word: \(suggestion.0), Likelihood: \(suggestion.1)")
}

Here, the generateCandidates(word:) method generates candidate words by swapping adjacent characters. This is a very simplistic way to generate candidates; in a real-world application, you might use more sophisticated techniques such as Damerau-Levenshtein distance. The autocorrect(word:context:topN:) method takes a misspelled word and a context, generates candidates, and then uses the PPM model to rank these candidates by their likelihood (note that in this sketch the context argument is accepted but not yet used in the ranking).

gavinhenderson commented 11 months ago

I'm closing this as prediction now does work in more languages. However, I will open a new issue to improve prediction and link back to your notes.