aliceyliang / letter-boxed-solver


14x word filtering speed improvements using regex #6

Open funkhouserw opened 10 months ago

funkhouserw commented 10 months ago

Hello!

I'm glad I wasn't alone in wanting to build a solver for letter-boxed. For the initial wordlist filtering, you can use regular expressions to improve speed by 14-15x. Here's an example with a profiler to compare speeds:

import cProfile
import regex as re

def regex_approach(word_file):
    top = ['T','C','R']
    right = ['I','U','O']
    left = ['K','M','N']
    bottom = ['S','A','L']
    letters = {
        'top':top,
        'bottom':bottom,
        'left': left,
        'right': right
        }

    _top = "".join(top)
    _left = "".join(left)
    _right = "".join(right)
    _bottom = "".join(bottom)

    all_letters = [y for x in letters.values() for y in x]
    ALLLETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    letter_combos = "".join(all_letters)
    # We don't actually need BADLETTERS, we can just do not (^) letter_combos, 
    # but this was a bit more demonstrative at the cost of performance
    BADLETTERS = re.sub(f'[{letter_combos}]','',ALLLETTERS)
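    # (untested sketch of that alternative: negate the puzzle letters directly,
    #  e.g. rf'^[A-Z]*[^{letter_combos}\n]+[A-Z]*$' -- keeping \n out of the class
    #  so a match can't run across line boundaries)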
    # we don't split the word file into a list; we can just use fast regex instead
    # step 1: remove words containing bad letters -- re.MULTILINE makes ^ and $ match at each line
    _words = re.sub(rf'^[A-Z]*[{BADLETTERS}\']{{1,}}[A-Z]*$', '', word_file, flags=re.MULTILINE)
    # step 2: remove words that violate the same-side rule (the pattern matches any word with two adjacent letters from the same side)
    same_side_pattern = rf'^[A-Z]*([{_top}][{_top}]|[{_left}][{_left}]|[{_bottom}][{_bottom}]|[{_right}][{_right}])[A-Z]*$'
    _words = re.sub(same_side_pattern, '', _words, flags=re.MULTILINE)
    # N.B. I'm pretty sure we can combine these two regexes somehow, which will be even more performant, but it started to get ugly
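    # (one untested way to combine steps 1 and 2 into a single pass:
    #  rf'^[A-Z]*(?:[{BADLETTERS}\']+|[{_top}]{{2}}|[{_left}]{{2}}|[{_bottom}]{{2}}|[{_right}]{{2}})[A-Z]*$'
    #  i.e. drop a word if it contains any bad character or any adjacent same-side pair)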
    # remove blanks
    _words = re.sub(r'\n+', '\n', _words)
    return _words.split()

def existing_approach(word_file):
    top = ['T','C','R']
    right = ['I','U','O']
    left = ['K','M','N']
    bottom = ['S','A','L']

    sides = {
        'top':top,
        'bottom':bottom,
        'left': left,
        'right': right
        }

    # map each letter to the side it sits on, e.g. 'T' -> 'top'
    pos = {letter: side for side, side_letters in sides.items() for letter in side_letters}
    chars = set(pos.keys())

    actual_words = sorted(set(word.strip().upper() for word in word_file.split()))
    valid_words = [w for w in actual_words if set(w) - chars == set()]
    toss = []
    for word in valid_words:
        ## make sure adjacent letters aren't on the same side (and don't repeat)
        letters = list(word)
        num = 1
        while num < len(letters):
            if (pos[letters[num]] == pos[letters[num-1]]):
                toss.append(word)
                num = len(letters)
            else:
                num += 1
    return [w for w in valid_words if w not in toss]

if __name__ == "__main__":
    with open('./words_hard.txt','r') as f1:
        words = f1.read()
    profiler = cProfile.Profile()
    profiler.enable()

    #existing_approach(words)
    # Easy : 2354840 function calls in 0.622 seconds
    # Hard: 8830337 function calls in 2.383 seconds

    regex_approach(words)
    # Easy : 3963 function calls (3910 primitive calls) in 0.044 seconds, 14x faster
    # Hard: 3963 function calls (3910 primitive calls) in 0.150 seconds, 15x faster

    profiler.disable()
    profiler.print_stats(sort='cumulative')

I'm still poking at a much faster solver for solutions of n >= 2 words. What I have so far is only marginally better -- it still involves a lot of iterating through lists and regex positive lookaheads.
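
(For concreteness, a naive two-word pass over the filtered list looks roughly like the sketch below -- this is just an illustration of the brute-force pairing, not the lookahead approach I mentioned.)

def find_two_word_solutions(all_letters: list, valid_words: list):
    # illustrative brute force: the second word must start with the first word's
    # last letter, and together the pair must cover all twelve puzzle letters
    target = set(all_letters)
    by_first = {}
    for w in valid_words:
        by_first.setdefault(w[0], []).append(w)
    solutions = []
    for first in valid_words:
        needed = target - set(first)
        for second in by_first.get(first[-1], []):
            if needed <= set(second):
                solutions.append((first, second))
    return solutions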

However, for one-word solutions, we can use the original, un-split word file directly and get a faster result with lookarounds.

def find_one_word_solution(all_letters: list, _words: str):
    # one (?=.*letter) lookahead per puzzle letter, anchored at a word boundary,
    # then \w* grabs the matching word itself
    combo = [fr'(?=.*{letter})' for letter in all_letters]
    _pattern = r'\b' + ''.join(combo) + r'\w*\b'
    return re.findall(_pattern, _words)
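
With the board above that expands to twelve lookaheads in front of \w* -- one (?=.*X) per puzzle letter. Assuming all_letters is the flattened letter list from regex_approach and words is the raw file contents, the call is just:

one_worders = find_one_word_solution(all_letters, words)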

I just wanted to share this for fun, but if you want me to submit a pull request for any of this, let me know.