There is a possibility that some bus stop name of bus line names has some error.
The filtering feature should take account of common typing errors.
eg. Akorondrano instead of Ankorondrano
Implementation details
Use Levenshtein distance
Take possible edits (splits, deletes, transposes, replaces, inserts) into account
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Spelling Corrector.
Copyright 2007 Peter Norvig.
Open source code under MIT license: http://www.opensource.org/licenses/mit-license.php
"""
import re, collections
def words(text): return re.findall('[a-z]+', text.lower())
def train(features):
model = collections.defaultdict(lambda: 1)
for f in features:
model[f] += 1
return model
NWORDS = train(words(file('big.txt').read()))
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def edits1(word):
s = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [a + b[1:] for a, b in s if b]
transposes = [a + b[1] + b[0] + b[2:] for a, b in s if len(b)>1]
replaces = [a + c + b[1:] for a, b in s for c in alphabet if b]
inserts = [a + c + b for a, b in s for c in alphabet]
return set(deletes + transposes + replaces + inserts)
def known_edits2(word):
return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
def known(words): return set(w for w in words if w in NWORDS)
def correct(word):
candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
return max(candidates, key=NWORDS.get)
################ Testing code from here on ################
def spelltest(tests, bias=None, verbose=False):
import time
n, bad, unknown, start = 0, 0, 0, time.clock()
if bias:
for target in tests: NWORDS[target] += bias
for target,wrongs in tests.items():
for wrong in wrongs.split():
n += 1
w = correct(wrong)
if w!=target:
bad += 1
unknown += (target not in NWORDS)
if verbose:
print 'correct(%r) => %r (%d); expected %r (%d)' % (
wrong, w, NWORDS[w], target, NWORDS[target])
return dict(bad=bad, n=n, bias=bias, pct=int(100. - 100.*bad/n),
unknown=unknown, secs=int(time.clock()-start) )
Description
There is a possibility that some bus stop name of bus line names has some error.
The filtering feature should take account of common typing errors.
eg. Akorondrano instead of Ankorondrano
Implementation details