move-coop / parsons

A python library of connectors for the progressive community.
https://www.parsonsproject.org/
Other
261 stars 132 forks source link

Email validation/parsing #91

Open mklaber opened 4 years ago

mklaber commented 4 years ago

Cleaning signup sheets and other human-entered email addresses.

Here's the code I use that may be helpful to y'all. (It's a bit messy.) It uses email-validator and pydash (because I'm sad I don't get to write in node.js).

from email_validator import validate_email, EmailNotValidError, EmailSyntaxError, EmailUndeliverableError
import re
from pydash import predicates
from pydash.strings import trim, reg_exp_replace, clean, deburr
from typing import List
from pydash.collections import every, filter_
from pydash.arrays import flatten_deep

def empty_if_null(value: str) -> str:
    """Return *value* unchanged when it is truthy, otherwise an empty string.

    Any falsy input (``None``, ``""``, ...) is mapped to ``""``.
    """
    if not value:
        return ""
    return value

def trim_non_printing(value: str) -> str:
    """Strip whitespace and stray formatting characters from both ends.

    In addition to Unicode whitespace (which covers the no-break space
    U+00A0), a leading or trailing run of U+202A (left-to-right embedding)
    and U+25A0 (black square) is removed. Characters in the middle of the
    string are left untouched.

    Args:
        value: The text to trim. ``None`` is treated as an empty string and
            any other non-string is converted with ``str()``.

    Returns:
        The trimmed text.
    """
    text = "" if value is None else str(value)
    # Raw string: the regex engine interprets ``\uXXXX`` and ``\s`` itself,
    # so Python never sees ``\s`` as an (invalid) string escape.
    edge_chars = r'[\u202a\u25a0\u00a0\s]+'
    text = re.sub(edge_chars + '$', '', text.strip())
    return re.sub('^' + edge_chars, '', text)

def clean_email_string(value: str) -> str:
    """Normalize a raw address string and remove all whitespace from it.

    Non-string input yields ``""``. Otherwise the value is passed through
    pydash's ``deburr`` and ``clean``, trimmed with ``trim_non_printing``,
    and finally stripped of every remaining whitespace run. None of these
    steps changes letter case.
    """
    if not predicates.is_string(value):
        return ""
    # Fold accented characters, then tidy the surrounding/internal spacing.
    without_accents = deburr(value)
    tidied = clean(without_accents)
    trimmed = trim_non_printing(tidied)
    # An address cannot contain spaces, so drop any left in the middle.
    return reg_exp_replace(trimmed, r'\s+', '')

# Matches ``Display Name <local@domain>`` and captures the part between the
# angle brackets. At least one character is required before the ``<``.
email_display_name_re = re.compile(r".+\<(?P<email>[^@]+@[^\>]+)\>")

def fix_common_email_problems(value: str) -> str:
    """Repair frequent typing mistakes in a single email address.

    Steps, in order:
      1. If the value looks like ``Name <address>``, keep only the address.
      2. Normalize it with ``clean_email_string``.
      3. Strip stray ``, . : " > < '`` characters from both ends, then
         strip surrounding whitespace.
      4. Turn a trailing ``,com`` into ``.com``.

    Args:
        value: The raw address text.

    Returns:
        The repaired address, or ``""`` when *value* is not a string (the
        same result ``clean_email_string`` gives for non-strings).
    """
    # Guard before the regex: matching a non-string raises TypeError.
    if not isinstance(value, str):
        return ""
    display_name_match = email_display_name_re.match(value)
    if display_name_match:
        value = display_name_match.group('email')
    value = clean_email_string(value)
    # trim off the start or end: ,  .  :  "  >  <  '
    # then trim whitespace again
    value = trim(trim(value, ',.:"><\''))
    # fix common suffix issues (could do a better job with this though...)
    value = reg_exp_replace(value, r',com$', '.com')
    return value

def clean_emails(email: str) -> List[str]:
    """Extract every valid address from one human-entered value.

    The value is first validated as-is with ``validate_email``. If that
    fails and the text does not contain exactly one ``@``, it is split on
    the first of ``;``, ``/``, ``,`` or ``|`` that leaves exactly one ``@``
    in every non-blank piece, and each piece is cleaned on its own. As a
    last resort the text is repaired once with ``fix_common_email_problems``
    and validated again. Progress and failures are reported via ``print``.

    Args:
        email: Raw text that may hold zero, one or several addresses.

    Returns:
        A flat list of lower-cased validated addresses; empty when nothing
        could be salvaged or when *email* is not a string.
    """
    def _clean_emails(candidate: str, already_fixed: bool) -> List[str]:
        try:
            return [validate_email(candidate)['email'].lower()]
        except EmailNotValidError as e:
            msg = str(e)
            # Count the @-signs directly instead of matching the wording of
            # the validator's error message, which is not a stable interface.
            if candidate.count('@') != 1:
                print(f'try splitting or {candidate} with {candidate.count("@")} @ signs')
                for delim in [';', '/', ',', '|']:
                    # if email is split by this delimiter, do we end up with one @ in each set?
                    # if so, split on that delimiter and treat each as their own address in need
                    # of cleaning. Blank pieces (from leading, trailing or doubled
                    # delimiters) are ignored so they do not defeat the split.
                    pieces = [piece for piece in candidate.split(delim) if piece.strip()]
                    if pieces and all(piece.count('@') == 1 for piece in pieces):
                        print(f'the delimiter is {delim}')
                        found: List[str] = []
                        for piece in pieces:
                            found.extend(_clean_emails(piece, False))
                        return found
                print("Can't figure out what delimiter it is so lets just try cleaning in otherways")
            # Only attempt the repair once per candidate to guarantee termination.
            if not already_fixed:
                return _clean_emails(fix_common_email_problems(candidate), True)
            print(f'Giving up, {candidate} is probably just a really bad address due to {msg}')
            return []

    # Blank spreadsheet cells and similar non-text values carry no address.
    if not isinstance(email, str):
        return []
    return [address for address in _clean_emails(email, False) if address]
shaunagm commented 1 year ago

Seems related to #554