Cleaning signup sheets and other human-entered email addresses.
Here's the code I use, which may be helpful to y'all. (It's a bit messy.) It uses email-validator and pydash (because I'm sad I don't get to write in Node.js).
from email_validator import validate_email, EmailNotValidError, EmailSyntaxError, EmailUndeliverableError
import re
from pydash import predicates
from pydash.strings import trim, reg_exp_replace, clean, deburr
from typing import List
from pydash.collections import every, filter_
from pydash.arrays import flatten_deep
def empty_if_null(value: str) -> str:
    """Return ``value`` when it is truthy, otherwise an empty string.

    ``None`` and any other falsy input (including ``""``) come back as ``""``.
    """
    if not value:
        return ""
    return value
def trim_non_printing(value: str) -> str:
    """Strip whitespace and a few junk characters from both ends of ``value``.

    In addition to ordinary whitespace, the characters removed from the edges
    are U+202A (left-to-right embedding), U+25A0 (black square) and U+00A0
    (no-break space). Characters in the middle of the string are untouched.

    Returns the trimmed string.
    """
    # The backslash before "s" is doubled so the literal contains the regex
    # escape for "whitespace" without relying on Python passing an unknown
    # string escape through unchanged (which emits a SyntaxWarning on recent
    # interpreters). The resulting pattern text is the same as before.
    edge_junk = '[\u202a\u25a0\u00a0\\s]+'

    value = trim(value)
    # Trailing run first, then leading run.
    value = reg_exp_replace(value, edge_junk + '$', '')
    value = reg_exp_replace(value, '^' + edge_junk, '')
    return value
def clean_email_string(value: str) -> str:
    """Normalise a raw, hand-typed email string.

    Non-string input yields ``""``. For strings the pipeline is:
    ``deburr`` -> ``clean`` -> ``trim_non_printing`` -> remove every run of
    whitespace, including whitespace in the middle of the address.

    Letter case is not changed here.
    """
    if predicates.is_string(value):
        deburred = deburr(value)
        collapsed = clean(deburred)
        edge_trimmed = trim_non_printing(collapsed)
        # An address cannot contain spaces, so drop any that remain inside it.
        return reg_exp_replace(edge_trimmed, r'\s+', '')
    return ""
# Matches input shaped like "Display Name <local@domain>". The named group
# "email" captures the text between the angle brackets. At least one character
# must precede the "<" for the pattern to match.
email_display_name_re = re.compile(r".+\<(?P<email>[^@]+@[^\>]+)\>")
def fix_common_email_problems(value: str) -> str:
    """Repair frequent hand-typing mistakes in a single email address.

    Steps, in order:
      1. If the input looks like ``Name <addr>``, keep only ``addr``.
      2. Normalise the string with ``clean_email_string``.
      3. Trim stray punctuation (``, . : " > < '``) and then whitespace from
         both ends.
      4. Turn a trailing ``,com`` into ``.com``.

    Non-string input skips step 1 and goes straight to ``clean_email_string``,
    which returns ``""`` for it, instead of failing inside the regex match.

    Returns the repaired string. No validation is performed here.
    """
    if isinstance(value, str):
        # A single match() is enough: its match object already carries the
        # captured address, so there is no need to run the pattern again.
        display_name_match = email_display_name_re.match(value)
        if display_name_match:
            value = display_name_match.group('email')

    value = clean_email_string(value)

    # trim off the start or end: , . : " > < '
    # then trim whitespace again
    value = trim(trim(value, ',.:"><\''))

    # fix common suffix issues (could do a better job with this though...)
    value = reg_exp_replace(value, r',com$', '.com')
    return value
def clean_emails(email: str) -> List[str]:
    """Extract every valid, lower-cased address from one hand-entered string.

    The input is first validated as-is with ``validate_email``. When that
    fails, two recovery strategies are tried:

    * If the text does not contain exactly one ``@``, it may hold several
      addresses. Each of ``;``, ``/``, ``,`` and ``|`` is tried as a separator,
      and the first one that leaves exactly one ``@`` in every piece is used.
      Each piece is then cleaned independently.
    * Otherwise (or if no separator fits) the text is passed once through
      ``fix_common_email_problems`` and validated again.

    Progress and give-up messages are printed to stdout.

    Returns a flat list of the addresses that validated; empty when nothing
    could be salvaged.
    """
    delimiters = (';', '/', ',', '|')

    def _clean_emails(email: str, already_fixed: bool) -> list:
        """Return a (possibly nested) list of validated addresses for ``email``.

        ``already_fixed`` is True when ``email`` has already been through
        ``fix_common_email_problems``, so that repair is attempted only once.
        """
        try:
            # NOTE(review): dict-style access to validate_email's result is
            # believed to be deprecated in email-validator 2.x in favour of
            # attribute access -- confirm against the installed version.
            return [validate_email(email)['email'].lower()]
        except EmailNotValidError as e:
            msg = str(e)
            # Decide from the input itself instead of searching the error
            # text for "exactly one @-sign": the message wording is not part
            # of the validator's API and may differ between releases.
            at_count = email.count('@')
            if at_count != 1:
                print(f'try splitting or {email} with {at_count} @ signs')
                for delim in delimiters:
                    # if email is split by this delimiter, do we end up with
                    # one @ in each piece? if so, treat each piece as its own
                    # address in need of cleaning.
                    parts = email.split(delim)
                    if all(part.count('@') == 1 for part in parts):
                        print(f'the delimiter is {delim}')
                        return [_clean_emails(part, False) for part in parts]
                print("Can't figure out what delimiter it is so lets just try cleaning in otherways")
            if not already_fixed:
                return _clean_emails(fix_common_email_problems(email), True)
            print(f'Giving up, {email} is probably just a really bad address due to {msg}')
            return []

    results = _clean_emails(email, False)
    # Splitting yields nested lists; flatten them and drop empty entries.
    return filter_(flatten_deep(results))
Cleaning signup sheets and other human-entered email addresses.
Here's the code I use, which may be helpful to y'all. (It's a bit messy.) It uses email-validator and pydash (because I'm sad I don't get to write in Node.js).