Open DonaldTsang opened 5 years ago
First concepts
import unicodedata as ucd
import sys
def table(bits, name, *, start=0, stop=None):
    """Find aligned blocks of ``2**bits`` code points that are all "safe".

    A code point is considered safe when it has a Unicode name, is not a
    combining mark, is not a right-to-left character (bidirectional class
    ``R`` or ``AL``), and is not a space, control, format, surrogate,
    private-use or unassigned character.

    Args:
        bits: Number of bits encoded per character; blocks contain
            ``2**bits`` consecutive code points and block ``j`` covers
            ``j * 2**bits`` through ``(j + 1) * 2**bits - 1``.
        name: Label for the table. Accepted for caller compatibility; it
            is not used by the computation.
        start: First code point to examine (inclusive).
        stop: End of the scan (exclusive). Defaults to
            ``sys.maxunicode + 1`` so that the last code point is included.

    Returns:
        A list with one ``[block, nfc, nfkc, nfd, nfkd]`` entry per
        complete block, in ascending block order. ``block`` is the block
        index and each flag is True when every character of the block is
        unchanged by the corresponding normalization form.
    """
    if stop is None:
        stop = sys.maxunicode + 1
    size = 2 ** bits
    forms = ('NFC', 'NFKC', 'NFD', 'NFKD')
    # Disallow spaces + controls, formatters, surrogates, PUAs and non-chars.
    excluded_categories = frozenset(
        ('Zs', 'Zl', 'Zp', 'Cc', 'Cf', 'Cs', 'Co', 'Cn'))
    # Disallow right-to-left characters.
    excluded_bidi = frozenset(('R', 'AL'))

    # block index -> [safe character count, NFC, NFKC, NFD, NFKD]
    blocks = {}
    for i in range(start, stop):
        u = chr(i)
        # Cheapest test first: it rejects the large unassigned and
        # private-use ranges without any further lookups.
        if ucd.category(u) in excluded_categories:
            continue
        # Disallow diacritics (combining marks) and right-to-left text.
        if ucd.combining(u) != 0 or ucd.bidirectional(u) in excluded_bidi:
            continue
        # Passing a default avoids raising ValueError for unnamed characters.
        if ucd.name(u, None) is None:
            continue
        entry = blocks.setdefault(i // size, [0, True, True, True, True])
        entry[0] += 1
        for pos, form in enumerate(forms, start=1):
            # Once a flag is False it stays False; skip the normalization.
            if entry[pos] and ucd.normalize(form, u) != u:
                entry[pos] = False

    # Only blocks in which every code point qualified are usable.
    return [[block] + entry[1:]
            for block, entry in blocks.items()
            if entry[0] == size]
# Build one block table per candidate encoding width: 8 bits per character
# down to 4 bits per character. The tables are computed in the listed order.
a, b, c, d = [
    table(width, label)
    for width, label in ((8, 'byte'), (6, 'b64'), (5, 'b32'), (4, 'balf'))
]
I think this feature should be optional, so that the programmer can choose whether to use it. The table should also be compiled only once, or be dumped ahead of time (precompiled); for a similar concept, see https://github.com/dahlia/iso4217/blob/master/iso4217/__init__.py#L18-L64
@Parkayun some ideas:
See: https://github.com/qntm/base32768 and https://github.com/qntm/base2048
See https://github.com/qntm/safe-code-point and https://github.com/qntm/base65536gen; we would need something like these, but in Python.