Open marcolarosa opened 2 years ago
We will have a soundex search in the repository (not in the workspace) as in the Bates project. I have the python script that was used for this.
Soundex list: this list is to be used to make a search of Bates terms return as large a set of hits as possible.
It will be useful in creating standard forms, but there will need to be ordering of any conversion rules to prevent rules feeding themselves, and to allow forms like ‘rr’ to occur (which, for searching purposes, can be treated as ‘r’). For standardising spellings these rules will also have to take account of word-structure, with some sounds more likely to occur in initial or medial position.
Ignore all diacritics for the search.
Standard Bates
a ah, ar, aa, u
e
u oo, a
i ee
o u
p b, bb
t d, dh, rt, rd
k g, gg
m mm
n nn
ng gn
l lh, rl, ll
r rr
rr r
y
w
th dh
nh
lh
ny nj
iya eeia
uwa ooa, ua
j tj, ch, tch
rt rd
uwi ooee
ayi aiee
ayu aioo
aly ail
Vowel initial :
i yi
probably will need tweaking as we get more examples
"""
This script takes an XML (TEI) database with original spellings and a tab-separated list of character alternation pairs, and generates a new XML document with standardised variants inserted.
It runs over an entire directory, processing all files ending with .xml.
The -v option also gives a two-column output: original spelling and variant form.
"""
import sys
import os
import re
import codecs
from argparse import ArgumentParser, RawTextHelpFormatter
from collections import defaultdict
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
from xml.dom import minidom

# Serialise TEI elements with a default namespace instead of an ns0: prefix.
ET.register_namespace('', 'http://www.tei-c.org/ns/1.0')

# On Python 2 the standard streams are byte streams, so wrap them to accept
# unicode text. On Python 3 they are already text streams, and wrapping them
# in a bytes-producing writer would make every write fail.
if sys.version_info[0] < 3:
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)
def mapToSingleChar(word):
    """Replace each known digraph in *word* with a one-character placeholder.

    The placeholders are upper-case letters, so a later regex or replace can
    treat a digraph such as 'ng' or 'rr' as a single symbol.

    The substitutions are applied in the fixed order listed below. Order
    matters because digraphs overlap (for example 'rrt' becomes 'rT' when
    'rt' is handled before 'rr'); an ordered tuple is used rather than a dict
    so the result does not depend on dict iteration order, which is arbitrary
    on Python 2.
    """
    ordered_pairs = (
        ('rt', 'T'),
        ('rl', 'L'),
        ('rn', 'N'),
        ('th', 'D'),
        ('ly', 'Y'),
        ('ny', 'J'),
        ('nh', 'M'),
        ('rr', 'R'),
        ('ng', 'G'),
    )
    for origchar, singlechar in ordered_pairs:
        word = word.replace(origchar, singlechar)
    return word
def mapToOrigChar(word):
    """Expand every one-character placeholder in *word* back to its digraph.

    This undoes the upper-case placeholder substitution: 'T' becomes 'rt',
    'G' becomes 'ng', and so on. Characters that are not placeholders are
    left untouched.
    """
    digraph_for = {
        'T': 'rt',
        'L': 'rl',
        'N': 'rn',
        'D': 'th',
        'Y': 'ly',
        'J': 'ny',
        'M': 'nh',
        'R': 'rr',
        'G': 'ng',
    }
    # The expansions are lower-case and never contain a placeholder, so the
    # order in which they are applied does not affect the result.
    for placeholder, digraph in digraph_for.items():
        word = word.replace(placeholder, digraph)
    return word
def readCharGroupFile(filePath):
    """Read the character alternation file into a search -> replacements map.

    Each data line holds at least two tab-separated columns; the first is the
    search pattern and the second a replacement for it. Both are passed
    through mapToSingleChar before being stored. Lines starting with '#' and
    blank lines are ignored.

    Returns a defaultdict(list) mapping each search pattern to the list of
    its replacements, in file order.

    Raises ValueError if a data line has fewer than two columns.
    """
    groups = defaultdict(list)
    with open(filePath) as handle:
        for lineNumber, line in enumerate(handle, 1):
            # Strip '\r' as well so files with Windows line endings do not
            # leave a stray carriage return in the last column.
            line = line.rstrip('\r\n')
            if not line.strip() or line.startswith('#'):
                continue
            columns = line.split('\t')
            if len(columns) < 2:
                raise ValueError(
                    '%s, line %d: expected two tab-separated columns, got %r'
                    % (filePath, lineNumber, line))
            col1 = mapToSingleChar(columns[0])
            col2 = mapToSingleChar(columns[1])
            groups[col1].append(col2)
    return groups
# Substrings that may not appear in a variant once it has been mapped back to
# digraph spelling.
_ILLEGAL_SUBSTRINGS = (
    'ngrn', 'ngrt', 'ngrl', 'gh', 'gg', 'gy', 'hnh', 'hlh', 'hth', 'hr', 'yh',
    'nyrt', 'nrt', 'nrl', 'nrn', 'lrt', 'lrn', 'lrn', 'trt', 'trl', 'trn',
    'rrr', 'lynh', 'lylh', 'lyth', 'nynh', 'nylt', 'nyth', 'tynh', 'tylh',
    'tyth', 'rlth', 'rlnh', 'rllh', 'rnth', 'rnnh', 'rnlh', 'rtth', 'rtnh',
    'rtlh', 'rtrt', 'rtrl', 'rtrn', 'rlrt', 'rlrl', 'rlrn', 'rnrt', 'rnrl',
    'rnrn', 'rng', 'nhg', 'nyg', 'rth', 'rnh', 'thj', 'rnh', 'rny', 'rlh',
    'rly', 'yy', 'hh', 'hy',
)

# Regular expressions that may not match a variant in digraph spelling.
_ILLEGAL_PATTERNS = (
    r'[^aiur]r[^aiu]',
    r'rr[lnt][^aiu]',
    r'([^r])\1',
    r'[^aiu]rr',
    r'[^aiur][rh][tln]',
)


def _isLegalMapped(candidate, word):
    """Return True if no whitespace-separated part of *candidate* is ruled out.

    *candidate* is in single-character placeholder spelling; *word* is the
    spelling that was passed to variant().
    """
    for part in candidate.split():
        # NOTE(review): 'Np' uses the upper-case placeholder for 'rn', yet it
        # is tested against the unmapped word, so for lower-case input this
        # condition can never hold. Possibly the mapped word was intended;
        # behaviour is left unchanged here - confirm with the author.
        if 'mp' in part and 'Np' in word:
            return False
        if part.startswith(('Gk', 'R', 'N', 'T', 'L', 'Y', 'nj')):
            return False
        if part.endswith(('Rn', 'Rt', 'Rl')):
            return False
    return True


def variant(word, groups):
    """Generate alternative spellings of *word* from the alternation groups.

    *groups* maps a search regex to a list of replacement strings (as built
    by readCharGroupFile). Every search pattern that matches the mapped word
    is applied, with each of its replacements, to every variant collected so
    far. Candidates that fail the placeholder-level checks are dropped. The
    survivors are mapped back to digraph spelling and filtered again against
    the illegal substring and regex lists.

    Returns a list of spellings in no particular order. It can include the
    input spelling itself when that passes the filters.
    """
    mappedWord = mapToSingleChar(word)
    variants = set([mappedWord])
    for search in groups:
        if not re.search(search, mappedWord):
            continue
        for replace in groups[search]:
            # A replacement may carry the start-of-string anchor (backslash
            # + 'A') copied from its search pattern; it has no meaning in a
            # replacement string, so remove it.
            pat = replace.replace('\\A', '')
            # Iterate over a snapshot: the set grows inside the loop.
            for savedvar in list(variants):
                var = re.sub(search, pat, savedvar)
                if var in variants:
                    continue
                if _isLegalMapped(var, word):
                    variants.add(var)

    normalisedVariants = []
    for var in variants:
        var = mapToOrigChar(var)
        # An illegal variant is skipped on its own; it must not stop the
        # remaining variants from being examined.
        if any(substring in var for substring in _ILLEGAL_SUBSTRINGS):
            continue
        if any(re.search(regex, var) for regex in _ILLEGAL_PATTERNS):
            continue
        normalisedVariants.append(var)
    return normalisedVariants
def standardise(word):
    """Apply one pass of the spelling standardisation rules to *word*.

    *word* is expected to be lower-case already; it may contain several
    space-separated words. The rules are plain string and regex rewrites
    applied strictly in the order written, and many of them depend on the
    output of earlier ones, so the order must not be changed.

    A single pass is not guaranteed to reach a stable form; call the function
    repeatedly until the result stops changing.

    Returns the rewritten string.
    """
    # avoiding use of re.sub where possible
    word = re.sub(r'[!,?\"]', r' ', word)
    # Collapse runs of spaces (the punctuation replaced above can leave
    # several in a row) so the single-space trimming below is sufficient.
    while '  ' in word:
        word = word.replace('  ', ' ')
    if word.endswith(' '):
        word = word[:-1]
    if word.startswith(' '):
        word = word[1:]
    word = re.sub(r'\'$', r'', word)

    # Long vowels written with a macron.
    word = word.replace(u'ā', 'a')
    word = word.replace(u'ē', 'i')
    word = word.replace(u'ī', 'i')
    word = word.replace(u'ō', 'o')
    word = word.replace(u'ū', 'u')

    word = word.replace('rngu', 'rnku')

    # Word-final 'ur' / 'ir' get a doubled 'r'.
    if word.endswith('ur'):
        word = word+'r'
    word = word.replace('ur ', 'urr ')
    if word.endswith('ir'):
        word = word+'r'
    word = word.replace('ir ', 'irr ')

    # Vowel sequences, first group.
    word = word.replace('aioo', 'ayu')
    word = word.replace('aiu', 'ayu')
    word = word.replace('aua', 'awa')
    word = word.replace('aue', 'awa')
    word = word.replace('uai', 'uwayi')
    word = word.replace('ooi', 'uwi')
    word = word.replace('aui', 'awuyi')
    word = word.replace('ngg', 'ngk')
    word = word.replace('a-i', 'ayi')
    word = word.replace('e-i', 'ayi')

    # Word-initial 'i' and 'u' take a glide.
    if word.startswith('i'):
        word = 'y'+word
    if word.startswith('u'):
        word = 'w'+word
    word = word.replace(' u', ' wu')
    word = word.replace(' i', ' yi')

    # Vowel sequences, second group.
    word = word.replace('ee', 'i')
    word = word.replace('aia', 'aya')
    word = word.replace('ai', 'ayi')
    word = word.replace('ia', 'iya')
    word = word.replace('ea', 'iya')
    word = word.replace('ei', 'ayi')
    word = word.replace('iu', 'iwu')
    word = word.replace('au', 'awu')
    if word.endswith('ow'):
        word = word[:-2]+'awu'
    if word.endswith('aw'):
        word = word[:-2]+'awu'
    word = re.sub(r'ow([^aeiou])', r'awu\1', word)
    word = re.sub(r'aw([^aeiou])', r'awu\1', word)

    # Spellings of /a/.
    word = word.replace('aa', 'a')
    word = re.sub(r'ah([^aeiou])', r'a\1', word)
    if word.endswith('ah'):
        word = word[:-1]
    if word.endswith('ar'):
        word = word[:-1]
    word = word.replace('ah ', 'a ')
    word = word.replace('uh', 'a')
    word = word.replace('ar ', 'a ')

    # Word-final 'er', 'en', 'el'.
    if word.endswith('er'):
        word = word[:-2]+'a'
    word = word.replace('er ', 'a ')
    if word.endswith('en'):
        word = word[:-2]+'in'
    word = word.replace('en ', 'in ')
    if word.endswith('el'):
        word = word[:-2]+'il'
    word = word.replace('el ', 'il ')
    word = word.replace('el', 'al')

    # Spellings of /u/ and remaining vowel pairs.
    word = word.replace('ooa', 'uwa')
    word = word.replace('oo', 'u')
    word = word.replace('uu', 'u')
    word = word.replace('ii', 'i')
    word = word.replace('ua', 'uwa')
    word = word.replace('ui', 'uwi')
    word = word.replace('oa', 'uwa')

    # no double letters except for /rr/
    word = re.sub(r'([^r])\1', r'\1', word)

    word = word.replace(u'ŋ', 'ng')
    word = word.replace('yny', 'ny')
    word = word.replace('yn', 'ny')
    word = word.replace('yly', 'ly')
    word = word.replace('yl', 'ly')

    # Doubled digraphs collapse to a single digraph.
    word = word.replace('nhnh', 'nh')
    word = word.replace('ngng', 'ng')
    word = word.replace('nyny', 'ny')
    word = word.replace('rnrn', 'rn')
    word = word.replace('rlrl', 'rl')
    word = word.replace('lhlh', 'lh')
    word = word.replace('lyly', 'ly')
    word = word.replace('rtrt', 'rt')
    word = word.replace('thth', 'th')

    # Word-final 'y'.
    if word.endswith('y'):
        word = word[:-1]+'ayi'
    word = word.replace('y ', 'ayi ')

    # Voiced stops become voiceless; 'o' and 'e' merge into 'u' and 'i'.
    if word.startswith('g'):
        word = 'k'+word[1:]
    word = word.replace(' g', ' k')
    word = word.replace('b', 'p')
    word = word.replace('d', 't')
    word = word.replace('o', 'u')
    word = word.replace('e', 'i')
    # Any 'g' not preceded by 'n' (i.e. not part of 'ng') becomes 'k'.
    word = re.sub(r'([^n])g', r'\1k', word)

    # Sibilant and affricate spellings all become 'j'.
    word = word.replace('tch', 'j')
    word = word.replace('sch', 'j')
    word = word.replace('ty', 'j')
    word = word.replace('tj', 'j')
    word = word.replace('sj', 'j')
    word = word.replace('sth', 'j')
    word = word.replace('s', 'j')
    word = word.replace('c', 'k')
    word = word.replace('wh', 'w')

    # Glides and 'r' after a consonant.
    word = re.sub(r'([^aiu ])wu', r'\1u', word)
    word = re.sub(r'([^aiu ])w([aiu])', r'\1uw\2', word)
    word = re.sub(r'(^| )([^aiur ])r([aiu])', r'\1\2ur\3', word)
    word = re.sub(r'([^aiunl])y([^iua])', r'\1ayi\2', word)
    word = re.sub(r'ay([^iua])', r'ayi\1', word)
    word = re.sub(r'([^nltaiu ])y', r'\1i', word)
    word = re.sub(r'([^aiu])rr', r'\1r', word)

    # NOTE(review): inside the character class the unescaped '-' forms a
    # range from "'" (0x27) to '_' (0x5F), so this also deletes digits,
    # upper-case letters and punctuation such as '(', ')', '.', ':' - not
    # only apostrophe, hyphen and underscore. The pattern is kept as is
    # because narrowing it would change the output for existing data;
    # confirm the intended set before tightening it.
    word = re.sub(r'[\'-_]', r'', word)
    return word
def _standardiseToFixpoint(word):
    """Apply standardise() repeatedly until the spelling stops changing."""
    while True:
        respelt = standardise(word)
        if respelt == word:
            return word
        word = respelt


def _annotateTerm(term, charGroups, verbose):
    """Replace the content of a 'term' element with a 'choice' element.

    The 'choice' element holds the original spelling ('orig'), the
    standardised spelling ('corr') and one 'reg' element per further variant.
    With *verbose* set, each original/variant pair is also written to stdout
    as two tab-separated columns.
    """
    # Drop any existing child elements of the term.
    for existing in list(term):
        term.remove(existing)

    origword = term.text
    origlower = origword.lower()
    # The text moves into the new 'orig' element below.
    term.text = ''

    standard = _standardiseToFixpoint(origlower)
    targets = [standard]
    for w in variant(standard, charGroups):
        if w != standard and w not in targets:
            targets.append(w)

    # making /a/ variant if there's a /u/
    if re.search(r'u[^ ]', origlower) and not re.search(r'ur[tdnl]', origlower):
        uVariant = re.sub(r'u([^ ])', r'a\1', origlower)
        uVariant = _standardiseToFixpoint(uVariant)
        for w in variant(uVariant, charGroups):
            if w not in targets:
                targets.append(w)

    choiceelement = ET.SubElement(term, 'choice')
    ET.SubElement(choiceelement, 'orig').text = origword
    ET.SubElement(choiceelement, 'corr').text = standard
    if len(targets) > 1:
        for w in targets:
            if verbose:
                sys.stdout.write(origlower + '\t' + w + '\n')
            if w != standard:
                ET.SubElement(choiceelement, 'reg').text = w


def main():
    """Command-line entry point.

    Reads the character groups file, then processes every file in the given
    directory whose name ends with '.xml' (case-insensitive): each non-empty
    'term' element found under a 'text' element is annotated with
    standardised and variant spellings, and the resulting document is
    written under the same file name to a 'processed' sub-directory.
    Progress messages go to stderr.
    """
    parser = ArgumentParser(description=__doc__,
                            formatter_class=RawTextHelpFormatter)
    # The groups file is always read, so a missing -c is reported by
    # argparse instead of failing later when the file is opened.
    parser.add_argument('-c', '--chargroupfile', required=True,
                        help = 'character groups file, each line contains tab separated sets of characters')
    parser.add_argument('-v', '--verbose', dest='verbose',
                        help='prints a list of original word and variant forms',
                        action="store_true")
    parser.add_argument('xmldir', help = 'directory containing XML files')
    opts = parser.parse_args()

    charGroups = readCharGroupFile(opts.chargroupfile)
    xmlCount = 0
    inDir = opts.xmldir
    outDir = os.path.join(inDir, 'processed')
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    # Going through input directory
    for filename in os.listdir(inDir):
        if not filename.lower().endswith('.xml'):
            continue
        fullpath = os.path.join(inDir, filename)
        sys.stderr.write('Processing %s...\n' % (filename))

        # Passing the path lets ElementTree open and close the file itself.
        tree = ET.parse(fullpath)
        xmlroot = tree.getroot()

        # Iterate over snapshots: terms gain and lose children while being
        # annotated, and modifying a tree during a live iter() is undefined.
        for textelement in list(xmlroot.iter()):
            if not textelement.tag.endswith('text'):
                continue
            for child in list(textelement.iter()):
                # Finding 'term' elements; a term whose text is empty
                # (including one already annotated) is left alone.
                if child.tag.endswith('term') and child.text:
                    _annotateTerm(child, charGroups, opts.verbose)

        # Generating new XML trees in /processed directory
        outfile = os.path.join(outDir, filename)
        xmlstr = minidom.parseString(ET.tostring(xmlroot)).toprettyxml(indent='', newl='')
        # Binary mode: the document is encoded explicitly to UTF-8 bytes.
        with open(outfile, 'wb') as f:
            f.write(xmlstr.encode('utf-8'))
        xmlCount += 1

    sys.stderr.write('%s XML files processed.\n' % (xmlCount))
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()
Amy wants to be able to search the repository for alternate spellings and get all hits.
Two ways to implement:
Rulesets - index all words starting with K as also starting with G (for example) to be able to search on both spellings and get the same result. Is there a ruleset that applies across languages that makes sense? Does the ruleset need to be limited to a given language?
Give the user the ability to markup alternate spellings in situ: