import Foundation
/// A single node of the prediction trie built by `PPM`.
class Node {
    /// Counter incremented by `PPM.train` each time a training word passes through
    /// or ends at this node. Starts at zero.
    var frequency = 0
    /// Occurrence counts of the complete words that end at this node. Starts empty.
    var words = [String: Int]()
    /// Child nodes, keyed by the character that leads to them. Starts empty.
    var children = [Character: Node]()
}
/// A character-level prefix trie used to predict the next letter and to score words.
///
/// Every training word is inserted character by character starting at `root`.
/// After training, a node's `frequency` equals the number of training words that
/// start with the prefix leading to that node, and the `words` dictionary of the
/// node where a word ends counts how often that exact word was seen.
class PPM {
    /// Root of the trie; corresponds to the empty context.
    let root = Node()

    /// Reads the UTF-8 text file at `fileURL` and trains on its contents.
    ///
    /// A read failure is reported with `print` and leaves the model unchanged.
    ///
    /// - Parameter fileURL: Location of the training text.
    func train(fromFile fileURL: URL) {
        do {
            let text = try String(contentsOf: fileURL, encoding: .utf8)
            train(text: text)
        } catch {
            print("Error reading file: \(error)")
        }
    }

    /// Adds every whitespace-separated word of `text` to the trie.
    ///
    /// - Parameter text: Training text; may be empty.
    func train(text: String) {
        // Split on any whitespace (not only " ") so that newlines and tabs
        // never end up inside a stored word.
        for word in text.split(whereSeparator: { $0.isWhitespace }) {
            var currentNode = root
            for char in word {
                // Count the word at every prefix node it passes through.
                currentNode.frequency += 1
                if let existing = currentNode.children[char] {
                    currentNode = existing
                } else {
                    let created = Node()
                    currentNode.children[char] = created
                    currentNode = created
                }
            }
            // Count the word at its final node as well and record the full word there.
            currentNode.frequency += 1
            currentNode.words[String(word), default: 0] += 1
        }
    }

    /// Predicts what follows `context`, where `context` is the beginning of a word.
    ///
    /// - Parameters:
    ///   - context: Characters typed so far. An empty context predicts from the root.
    ///   - topN: Maximum number of entries per result list; values `<= 0` yield empty lists.
    /// - Returns: `letters` holds the most frequent next characters and `words` holds the
    ///   most frequent words that end exactly at `context`. Each likelihood is the entry's
    ///   count divided by the frequency of the context node. Both lists are empty when
    ///   `context` was never seen as a prefix during training.
    func predict(context: String, topN: Int) -> (letters: [(Character, Double)], words: [(String, Double)]) {
        var currentNode = root
        // Walk the context in reading order: the trie is built from the first
        // character of each word, so a reversed walk would not follow any trained path.
        for char in context {
            guard let nextNode = currentNode.children[char] else {
                return ([], [])
            }
            currentNode = nextNode
        }
        let topLetters = mostFrequentChildren(of: currentNode, topN: topN)
        let topWords = mostFrequentWords(of: currentNode, topN: topN)
        return (topLetters, topWords)
    }

    /// Returns up to `topN` children of `node`, most frequent first, each paired with
    /// `child.frequency / node.frequency`. Ties are ordered by character so the result
    /// does not depend on dictionary iteration order.
    private func mostFrequentChildren(of node: Node, topN: Int) -> [(Character, Double)] {
        // `prefix(_:)` traps on a negative length, and a zero total would divide by zero.
        guard topN > 0, node.frequency > 0 else { return [] }
        let totalFrequency = Double(node.frequency)
        let sortedChildren = node.children.sorted { lhs, rhs in
            if lhs.value.frequency != rhs.value.frequency {
                return lhs.value.frequency > rhs.value.frequency
            }
            return lhs.key < rhs.key
        }
        return sortedChildren.prefix(topN).map { entry in
            (entry.key, Double(entry.value.frequency) / totalFrequency)
        }
    }

    /// Returns up to `topN` words stored on `node`, most frequent first, each paired with
    /// `count / node.frequency`. Ties are ordered alphabetically so the result does not
    /// depend on dictionary iteration order.
    private func mostFrequentWords(of node: Node, topN: Int) -> [(String, Double)] {
        // `prefix(_:)` traps on a negative length, and a zero total would divide by zero.
        guard topN > 0, node.frequency > 0 else { return [] }
        let totalFrequency = Double(node.frequency)
        let sortedWords = node.words.sorted { lhs, rhs in
            if lhs.value != rhs.value {
                return lhs.value > rhs.value
            }
            return lhs.key < rhs.key
        }
        return sortedWords.prefix(topN).map { entry in
            (entry.key, Double(entry.value) / totalFrequency)
        }
    }
}
// Usage
let ppm = PPM()
// Replace this path with the actual location of the training text.
// `URL(fileURLWithPath:)` is used because `URL(string:)` with a bare path produces a
// URL without the `file` scheme, which is not a file URL and cannot be read from disk.
let fileURL = URL(fileURLWithPath: "path/to/your/text/file.txt")
ppm.train(fromFile: fileURL)

// Ask for the six most likely continuations of the typed prefix "hell".
let (topLetters, topWords) = ppm.predict(context: "hell", topN: 6)
print("Top letter predictions:")
for (char, likelihood) in topLetters {
    print("Next letter: \(char), Likelihood: \(likelihood)")
}
print("\nTop word predictions:")
for (word, likelihood) in topWords {
    print("Next word: \(word), Likelihood: \(likelihood)")
}
So really you'd want to update that training text continually too, but ideally you need a way for people to correct this training text. That was easy enough with Dasher, although undocumented: you literally edit the text file.
Also note this for autocorrection, although it is not clear how we would implement it:
extension PPM {
    /// Generates candidate spellings of `word` by swapping each pair of adjacent characters.
    ///
    /// - Parameter word: The (possibly misspelled) word.
    /// - Returns: One candidate per adjacent character pair, in left-to-right order.
    ///   Empty when `word` has fewer than two characters. A candidate equals `word`
    ///   itself when the swapped characters are identical.
    func generateCandidates(word: String) -> [String] {
        let chars = Array(word)
        // `0..<(chars.count - 1)` is an invalid range (and traps) for an empty word,
        // so bail out before forming it.
        guard chars.count > 1 else { return [] }
        return (0..<(chars.count - 1)).map { index -> String in
            var swapped = chars
            swapped.swapAt(index, index + 1)
            return String(swapped)
        }
    }

    /// Suggests corrections for a misspelled word.
    ///
    /// Each distinct candidate from `generateCandidates(word:)` is looked up with
    /// `predict(context:topN:)`; candidates for which the model reports a word are kept
    /// together with that word's likelihood.
    ///
    /// - Parameters:
    ///   - word: The misspelled word.
    ///   - context: Currently not consulted when scoring; kept so existing callers compile.
    ///   - topN: Maximum number of suggestions; values `<= 0` yield an empty list.
    /// - Returns: Up to `topN` `(candidate, likelihood)` pairs, highest likelihood first.
    func autocorrect(word: String, context: String, topN: Int) -> [(String, Double)] {
        // `prefix(_:)` traps on a negative length.
        guard topN > 0 else { return [] }

        var seen = Set<String>()
        var scoredCandidates: [(String, Double)] = []
        for candidate in generateCandidates(word: word) {
            // Swapping identical neighbours repeats a spelling; score each only once.
            guard seen.insert(candidate).inserted else { continue }
            // The prediction entries are unlabeled tuples, so the likelihood is element `.1`.
            if let likelihood = predict(context: candidate, topN: 1).words.first?.1 {
                scoredCandidates.append((candidate, likelihood))
            }
        }
        // `prefix(_:)` yields an ArraySlice; convert back to the declared Array type.
        return Array(scoredCandidates.sorted { $0.1 > $1.1 }.prefix(topN))
    }
}
// Usage
// Named differently from the earlier top-level `ppm` constant so that both usage
// snippets can coexist in one file without a redeclaration error.
let autocorrectModel = PPM()
autocorrectModel.train(text: "hello world hello everyone")
// Autocorrect the misspelled word "helo" given the context "hel"
let corrections = autocorrectModel.autocorrect(word: "helo", context: "hel", topN: 3)
print("Autocorrection suggestions:")
for (word, likelihood) in corrections {
    print("Word: \(word), Likelihood: \(likelihood)")
}
Here, the generateCandidates(word:) method generates candidate words by swapping adjacent characters. This is a very simplistic way to generate candidates; in a real-world application, you might use more sophisticated techniques such as Damerau-Levenshtein distance.
The autocorrect(word:context:topN:) method takes a misspelled word and a context, generates candidates, and then uses the PPM model to rank these candidates based on their likelihood given the context.
Not sure if this is useful
So really you'd want to update that training text continually too, but ideally you need a way for people to correct this training text. That was easy enough with Dasher, although undocumented: you literally edit the text file. Also note this for autocorrection, although it is not clear how we would implement it.
Here, the generateCandidates(word:) method generates candidate words by swapping adjacent characters. This is a very simplistic way to generate candidates; in a real-world application, you might use more sophisticated techniques such as Damerau-Levenshtein distance. The autocorrect(word:context:topN:) method takes a misspelled word and a context, generates candidates, and then uses the PPM model to rank these candidates based on their likelihood given the context.