akshaynagpal / w2n

Convert number words (eg. twenty one) to numeric digits (21)
http://w2n.readthedocs.io
MIT License
167 stars 78 forks source link

My wrapper code #73

Open quentinjs opened 1 year ago

quentinjs commented 1 year ago

I figured I'd contribute back; this has worked on 5,000 legal documents from many sources. Here is a wrapper I wrote to help support my use of W2N. Some of it deals with typos in the original documents or with badly scanned text.

In my case I don't care about pennies, so I strip the "xx/100"-type text. Hope this helps someone.

# Word-form tens; used to re-insert the separator in glued words such as
# "twentyone".
_TENS = (
    'twenty', 'thirty', 'forty', 'fifty',
    'sixty', 'seventy', 'eighty', 'ninety',
)

# Fractional-cents fragments that are dropped entirely
# (e.g. " and 00/100", " & 25/100ths", " and no/100").
_CENTS_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r' and \d+/100(ths|th)?',
        r' & \d+/100(ths|th)?',
        r' and no/100?',
    )
)

# Repairs for truncated or mis-scanned number words.  They are applied in
# this exact order; several rules only make sense after an earlier, longer
# rule has already had its chance (e.g. "^venty" before "^ven" before "^ve").
_OCR_FIXES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r'^venty', 'seventy'),
        (r'^irty', 'thirty'),
        (r'^fty', 'fifty'),
        (r'^neteen', 'nineteen'),
        (r'^fteen', 'fifteen'),
        (r'eightfen', 'eighteen'),
        (r'^ghteen', 'eighteen'),
        # Keep the leading space so the repaired word is not glued to the
        # previous one ("hundred ghteen" -> "hundred eighteen").
        (r' ghteen', ' eighteen'),
        (r'sixten', 'sixteen'),
        (r'^iwo ', 'two '),
        (r'^o ', 'two '),
        (r'^ven', 'seven'),
        (r'^even', 'seven'),
        (r'^ve', 'five'),
        (r'^x', 'six'),
        (r'elght', 'eight'),
        (r'light', 'eight'),
        (r'^ght', 'eight'),
        (r'^n ', 'ten '),
        (r'^nety', 'ninety'),
        (r'^elve', 'twelve'),
    )
)

# A tens word immediately followed by a letter ("twentyone"): every such
# occurrence gets a hyphen inserted after the tens word.
_GLUED_TENS = re.compile(r'(' + '|'.join(_TENS) + r')(?=[^\W\d_])')


def word2value(val):
    """Convert a spelled-out amount to a string of digits, best effort.

    Relies on ``re`` and ``w2n`` being available at module level
    (presumably ``from word2number import w2n`` -- confirm against the
    file's imports).

    Steps, in order:

    1. Remove ``$`` and ``,`` characters and lowercase the text.
    2. Drop fractional-cents fragments such as " and 00/100".
    3. If what remains consists only of digits, return it unchanged.
    4. Repair known truncated / mis-scanned number words.
    5. Insert a hyphen after a tens word that runs straight into the
       next word ("twentyone" -> "twenty-one").
    6. Hand the result to ``w2n.word_to_num``.

    Args:
        val: The text to convert.  ``None`` and empty input are returned
            unchanged.

    Returns:
        The number as a string of digits when conversion succeeds;
        otherwise the cleaned-up (lowercased, repaired) text, so callers
        can still inspect what could not be parsed.
    """
    if not val:
        return val

    # Strip currency formatting: every "$" and every "," (not just the
    # literal sequence "$,").
    val = re.sub(r'[$,]', '', val).lower()

    for cents in _CENTS_PATTERNS:
        val = cents.sub('', val)

    # Already a plain number (e.g. "$1,234" -> "1234"); nothing to parse.
    if val.isnumeric():
        return val

    for broken, repaired in _OCR_FIXES:
        val = broken.sub(repaired, val)

    val = _GLUED_TENS.sub(r'\1-', val)

    try:
        return str(w2n.word_to_num(val))
    except ValueError:
        # Deliberate best effort: text that cannot be parsed is returned
        # in its cleaned-up form instead of raising.
        return val