Closed kalvinchang closed 2 years ago
Note: I generated every possible tone realization of every ending with a Python script.
This script errs on the side of overgenerating.
I did not include the syllabic consonants (m and ng) in here and instead added those manually
# Generate every tone realization of each Tai-lo final together with its IPA
# and write them to poss.txt, one "orthography,ipa" pair per line.
# The script errs on the side of overgenerating; the syllabic consonants
# (m and ng) are not covered here.

# "tai-lo final,ipa" pairs, one per line
endings = """
a,a
e,e
i,i
o,ə
ong,ɔŋ
om,ɔm
op,ɔp
ok,ɔk
oo,ɔ
u,u
ai,ai̯
au,au̯
ia,i̯a
ing,iə̯ŋ
ik,iə̯k
ian,i̯ɛn
iat,i̯ɛt
io,i̯o
iong,i̯ɔŋ
iu,i̯u
ua,u̯a
ue,u̯e
ui,u̯i
iau,i̯au̯
uai,u̯ai̯
ann,ã
enn,ẽ
inn,ĩ
onn,ɔ̃
ainn,ãĩ̯
iann,ĩ̯ã
ionn,ĩ̯ɔ̃
iunn,ĩ̯ũ
uann,ũ̯ã
uinn,ũ̯ĩ
iaunn,ĩ̯ãũ̯
uainn,ũ̯ãĩ̯
"""
vowels = {'a', 'e', 'i', 'o', 'u'}
# use combining diacritics, in line with MOEDict
# each value lists the tone-marked forms of the vowel for tones 1 to 9
tone_map = {
    'a': 'a, á, à, a, â, ǎ, ā, a̍, a̋',
    'e': 'e, é, è, e, ê, ě, ē, e̍, e̋',
    'i': 'i, í, ì, i, î, ǐ, ī, i̍, i̋',
    'o': 'o, ó, ò, o, ô, ǒ, ō, o̍, ő',
    'u': 'u, ú, ù, u, û, ǔ, ū, u̍, ű'
}
# in order, from 1 to 9
tones_ipa = [
    '˦˦',
    '˥˩',
    '˧˩',
    '˧˨',  # 4th tone needs to be recognized separately because it has no tone marker
    '˨˦',
    '˥˩',
    '˧˧',
    '˦',
    '˧˥'
]


def tone_vowel_index(orth):
    """Return the index in *orth* of the vowel letter that carries the tone mark.

    In a diphthong or triphthong the mark goes on the 2nd vowel, except that
    'oo' is marked on its first o and any final beginning with 'a' is marked
    on that a.

    Raises:
        ValueError: if *orth* does not begin with a vowel letter.
    """
    if orth == 'oo' or orth.startswith('a'):
        return 0
    leading_vowels = 0
    for c in orth:
        if c not in vowels:
            break
        leading_vowels += 1
    if leading_vowels == 0:
        raise ValueError(f"final {orth!r} does not start with a vowel")
    # the only vowel, or the second vowel of a diphthong/triphthong
    return min(leading_vowels, 2) - 1


def tone_realizations(orth, ipa, known_endings):
    """Yield "orthography,ipa" entries (no newline) for every tone of one final.

    Args:
        orth: the Tai-lo spelling of the final, without tone mark.
        ipa: the IPA transcription of the final, without tone letters.
        known_endings: spellings listed explicitly in ``endings``; a derived
            stop-final spelling is skipped when it is already listed there.
    """
    idx = tone_vowel_index(orth)
    marked_vowels = tone_map[orth[idx]].split(', ')
    ends_in_stop = orth[-1] in {'h', 'p', 't', 'k'}
    ends_in_vowel = orth[-1] in vowels
    takes_glottal_stop = ends_in_vowel or orth.endswith('nn')

    for tone_num, (marked, tone_ipa) in enumerate(zip(marked_vowels, tones_ipa), start=1):
        new_ortho = orth[:idx] + marked + orth[idx + 1:]
        checked_tone = tone_num in {4, 8}

        if ends_in_stop:
            # ones ending in h,p,t,k should only have 4th and 8th tones
            if checked_tone:
                yield f"{new_ortho},{ipa}{tone_ipa}"
            continue

        if not checked_tone:
            yield f"{new_ortho},{ipa}{tone_ipa}"
            continue

        # 4th/8th tone on a final without a stop: derive the stop-final forms
        # instead (finals ending in a nasal consonant get nothing here)
        if takes_glottal_stop and orth + "h" not in known_endings:
            # h -> ʔ mapping
            yield f"{new_ortho}h,{ipa}ʔ{tone_ipa}"
        # 'ep','et','ek' are written with i in tailo
        if ends_in_vowel and orth != 'e':
            for coda in ('p', 't', 'k'):
                if orth + coda not in known_endings:
                    yield f"{new_ortho}{coda},{ipa}{coda}{tone_ipa}"


def generate_entries():
    """Return all "orthography,ipa" entries for every final in ``endings``."""
    pairs = [line.split(',') for line in endings.split()]
    known_endings = {orth for orth, _ in pairs}
    entries = []
    for orth, ipa in pairs:
        entries.extend(tone_realizations(orth, ipa, known_endings))
    return entries


# The output is IPA plus combining diacritics, so pin the encoding instead of
# relying on the platform default (which may be unable to encode it).
with open("poss.txt", "w", encoding="utf-8") as f:
    f.writelines(entry + "\n" for entry in generate_entries())

# example output for the final "a" (tones 1 to 9):
# a,a˦˦
# á,a˥˩
# à,a˧˩
# ah,aʔ˧˨ (plus ap, at, ak)
# â,a˨˦
# ǎ,a˥˩
# ā,a˧˧
# a̍h,aʔ˦ (plus a̍p, a̍t, a̍k)
# a̋,a˧˥
An alternative to including every possible tone realization in the map file is to handle the tone marker with context-sensitive rules during pre/postprocessing, because Epitran uses NFD and breaks down é, for example, into e and ◌́.
forgot to note -
Wiktionary includes ◌̚ (unreleased stop) for -p,-t,-k, but I did not distinguish this
for posterity,
I did some digging and found that the MOE Dictionary actually allows you to switch between Tai-lo and POJ Romanization, and they do this deterministic mapping on the front end: https://github.com/g0v/moedict-webkit/blob/35af25fe8f086eec666a4c1057ae4af739447b4c/view.ls
however, the original data is in Tai-lo (https://github.com/g0v/moedict-webkit/blob/master/ENG-README.md), so I will stick with Tai-lo for now
Updated Tailo generation script
# Generate every tone realization of each Tai-lo final together with its IPA
# and write them to poss.txt, one "orthography,ipa" pair per line.
# The script errs on the side of overgenerating; the syllabic consonants
# (m and ng) are not covered here.

# "tai-lo final,ipa" pairs, one per line
endings = """
a,a
e,e
i,i
o,ə
ong,ɔŋ
om,ɔm
op,ɔp
ok,ɔk
oo,ɔ
u,u
ai,ai̯
au,au̯
ia,i̯a
ing,iə̯ŋ
ik,iə̯k
ian,i̯ɛn
iat,i̯ɛt
io,i̯o
iong,i̯ɔŋ
iu,i̯u
ua,u̯a
ue,u̯e
ui,u̯i
iau,i̯au̯
uai,u̯ai̯
ann,ã
enn,ẽ
inn,ĩ
onn,ɔ̃
ainn,ãĩ̯
iann,ĩ̯ã
ionn,ĩ̯ɔ̃
iunn,ĩ̯ũ
uann,ũ̯ã
uinn,ũ̯ĩ
iaunn,ĩ̯ãũ̯
uainn,ũ̯ãĩ̯
"""
vowels = {'a', 'e', 'i', 'o', 'u'}
# use combining diacritics, in line with MOEDict
# each value lists the tone-marked forms of the vowel for tones 1 to 9
tone_map = {
    'a': 'a, á, à, a, â, ǎ, ā, a̍, a̋',
    'e': 'e, é, è, e, ê, ě, ē, e̍, e̋',
    'i': 'i, í, ì, i, î, ǐ, ī, i̍, i̋',
    'o': 'o, ó, ò, o, ô, ǒ, ō, o̍, ő',
    'u': 'u, ú, ù, u, û, ǔ, ū, u̍, ű'
}
# in order, from 1 to 9
tones_ipa = [
    '˥',
    '˥˩',
    '˧˩',
    '˧',  # 4th tone needs to be recognized separately because it has no tone marker
    '˨˦',
    '˥˩',
    '˧',
    '˥',
    '˧˥'
]


def tone_vowel_index(orth):
    """Return the index in *orth* of the vowel letter that carries the tone mark.

    In a diphthong or triphthong the mark goes on the 2nd vowel, except that
    'oo' is marked on its first o and any final beginning with 'a' is marked
    on that a.

    Raises:
        ValueError: if *orth* does not begin with a vowel letter.
    """
    if orth == 'oo' or orth.startswith('a'):
        return 0
    leading_vowels = 0
    for c in orth:
        if c not in vowels:
            break
        leading_vowels += 1
    if leading_vowels == 0:
        raise ValueError(f"final {orth!r} does not start with a vowel")
    # the only vowel, or the second vowel of a diphthong/triphthong
    return min(leading_vowels, 2) - 1


def tone_realizations(orth, ipa, known_endings):
    """Yield "orthography,ipa" entries (no newline) for every tone of one final.

    Args:
        orth: the Tai-lo spelling of the final, without tone mark.
        ipa: the IPA transcription of the final, without tone letters.
        known_endings: spellings listed explicitly in ``endings``; a derived
            stop-final spelling is skipped when it is already listed there.
    """
    idx = tone_vowel_index(orth)
    marked_vowels = tone_map[orth[idx]].split(', ')
    ends_in_stop = orth[-1] in {'h', 'p', 't', 'k'}
    ends_in_vowel = orth[-1] in vowels
    takes_glottal_stop = ends_in_vowel or orth.endswith('nn')

    for tone_num, (marked, tone_ipa) in enumerate(zip(marked_vowels, tones_ipa), start=1):
        new_ortho = orth[:idx] + marked + orth[idx + 1:]
        checked_tone = tone_num in {4, 8}

        if ends_in_stop:
            # ones ending in h,p,t,k should only have 4th and 8th tones
            if checked_tone:
                yield f"{new_ortho},{ipa}{tone_ipa}"
            continue

        if not checked_tone:
            yield f"{new_ortho},{ipa}{tone_ipa}"
            continue

        # 4th/8th tone on a final without a stop: derive the stop-final forms
        # instead (finals ending in a nasal consonant get nothing here)
        if takes_glottal_stop and orth + "h" not in known_endings:
            # h -> ʔ mapping
            yield f"{new_ortho}h,{ipa}ʔ{tone_ipa}"
        # 'ep','et','ek' are written with i in tailo
        if ends_in_vowel and orth != 'e':
            for coda in ('p', 't', 'k'):
                if orth + coda not in known_endings:
                    yield f"{new_ortho}{coda},{ipa}{coda}{tone_ipa}"


def generate_entries():
    """Return all "orthography,ipa" entries for every final in ``endings``."""
    pairs = [line.split(',') for line in endings.split()]
    known_endings = {orth for orth, _ in pairs}
    entries = []
    for orth, ipa in pairs:
        entries.extend(tone_realizations(orth, ipa, known_endings))
    return entries


# The output is IPA plus combining diacritics, so pin the encoding instead of
# relying on the platform default (which may be unable to encode it).
with open("poss.txt", "w", encoding="utf-8") as f:
    f.writelines(entry + "\n" for entry in generate_entries())

# example output for the final "a" (tones 1 to 9):
# a,a˥
# á,a˥˩
# à,a˧˩
# ah,aʔ˧ (plus ap, at, ak)
# â,a˨˦
# ǎ,a˥˩
# ā,a˧
# a̍h,aʔ˥ (plus a̍p, a̍t, a̍k)
# a̋,a˧˥
note: 2cb38af in #119 fixes some issues with Tai-lo (e.g. the entry for j in the mapping table was not correct)
Adding support for Hokkien
Sources:
I choose to use the tâi-lô romanization because it is the one used by Taiwan's Ministry of Education (moedict.tw)
Conversion from POJ romanization (pe̍h-uē-jī, 白話字) to tâi-lô is straightforward and deterministic, and requires only a few changes (all of which are on the Wikipedia page, I think)
I also stick with the prestigious Tainan dialect (e.g. ko 高 -> [kə] instead of [ko] from the Taipei dialect). The Wikipedia article uses
<kor>
to distinguish this, but I do not think the Ministry of Education does this: https://www.moedict.tw/'%E9%AB%98