Add chunk utils - Githubissues

woctezuma commented 1 year ago

Fix for:

8

This commit builds upon #11, and aims to break down the dictionary of patterns into chunks.

GillesVandewiele commented 1 year ago

def get_pattern(all_dictionary, guess_word):
    chunk_no = find_chunk_no(all_dictionary, guess_word)
    pattern_dict = load_pattern_dict(chunk_no)
    return pattern_dict[guess_word]

Would this line: pattern_dict = load_pattern_dict(chunk_no) not occur more overhead than it would gain? Loading the pickle from disk for every pattern calculation seems to be very expensive. Perhaps these chunks of the pattern_dict need to be stored in memory?

GillesVandewiele commented 1 year ago

I created a small benchmark script based on an older version of the code. I think we should re-run this on every branch and check the (avg/max) number of guesses and time required for 100 randomly sampled words (with a fixed seed).

import os
import itertools
import random
import pickle
import time
from tqdm import tqdm
from scipy.stats import entropy
from collections import defaultdict, Counter

import numpy as np
np.random.seed(42)

N_GUESSES = 10
DICT_FILE_all = 'all_words.txt'
DICT_FILE = 'words.txt'
SAVE_TIME = True

def calculate_pattern(guess, true):
    """Generate a pattern list that Wordle would return if you guessed
    `guess` and the true word is `true`
    Thanks to MarkCBell, Jeroen van der Hooft and gbotev1
    >>> calculate_pattern('weary', 'crane')
    (0, 1, 2, 1, 0)
    >>> calculate_pattern('meets', 'weary')
    (0, 2, 0, 0, 0)
    >>> calculate_pattern('rower', 'goner')
    (0, 2, 0, 2, 2)
    """
    wrong = [i for i, v in enumerate(guess) if v != true[i]]
    counts = Counter(true[i] for i in wrong)
    pattern = [2] * 5
    for i in wrong:
        v = guess[i]
        if counts[v] > 0:
            pattern[i] = 1
            counts[v] -= 1
        else:
            pattern[i] = 0
    return tuple(pattern)

def generate_pattern_dict(dictionary):
    """For each word and possible information returned, store a list
    of candidate words
    >>> pattern_dict = generate_pattern_dict(['weary', 'bears', 'crane'])
    >>> pattern_dict['crane'][(2, 2, 2, 2, 2)]
    {'crane'}
    >>> sorted(pattern_dict['crane'][(0, 1, 2, 0, 1)])
    ['bears', 'weary']
    """
    pattern_dict = defaultdict(lambda: defaultdict(set))
    for word in tqdm(dictionary):
        for word2 in dictionary:
            pattern = calculate_pattern(word, word2)
            pattern_dict[word][pattern].add(word2)
    return dict(pattern_dict)

def calculate_entropies(words, possible_words, pattern_dict, all_patterns):
    """Calculate the entropy for every word in `words`, taking into account
    the remaining `possible_words`"""
    entropies = {}
    for word in words:
        counts = []
        for pattern in all_patterns:
            matches = pattern_dict[word][pattern]
            matches = matches.intersection(possible_words)
            counts.append(len(matches))
        entropies[word] = entropy(counts)
    return entropies

def main():
    # load all 5-letter-words for making patterns 
    with open(DICT_FILE_all) as ifp:
        all_dictionary = list(map(lambda x: x.strip(), ifp.readlines()))

    # Load 2315 words for solutions
    with open(DICT_FILE) as ifp:
        dictionary = list(map(lambda x: x.strip(), ifp.readlines()))

    error_msg = 'Dictionary contains different length words.'
    assert len({len(x) for x in all_dictionary}) == 1, error_msg
    print(f'Loaded dictionary with {len(all_dictionary)} words...')
    WORD_LEN = len(all_dictionary[0]) # 5-letters 

    # Generate the possible patterns of information we can get
    all_patterns = list(itertools.product([0, 1, 2], repeat=WORD_LEN))

    # Calculate the pattern_dict and cache it, or load the cache.
    if 'pattern_dict.p' in os.listdir('.'):
        pattern_dict = pickle.load(open('pattern_dict.p', 'rb'))
    else:
        pattern_dict = generate_pattern_dict(all_dictionary)
        pickle.dump(pattern_dict, open('pattern_dict.p', 'wb+'))

    # Simulate games
    stats = defaultdict(list)
    words_to_guess = np.random.choice(
        dictionary,
        replace=False,
        size=100
    )
    for WORD_TO_GUESS in tqdm(words_to_guess):

        if SAVE_TIME:
            guess_word = 'tares'
            all_words = set(all_dictionary)
            info = calculate_pattern(guess_word, WORD_TO_GUESS)
            words = pattern_dict[guess_word][info]
            all_words = all_words.intersection(words)
            init_round = 1
        else:
            all_words = set(all_dictionary)
            init_round = 0

        for n_round in range(init_round, N_GUESSES):

            candidates = all_dictionary
            entropies = calculate_entropies(candidates, all_words, pattern_dict, all_patterns)

            if max(entropies.values()) < 0.1:
                candidates = all_words
                entropies = calculate_entropies(candidates, all_words, pattern_dict, all_patterns)

            # Guess the candiate with highest entropy
            guess_word = max(entropies.items(), key=lambda x: x[1])[0]
            info = calculate_pattern(guess_word, WORD_TO_GUESS)

            # Print round information
            if guess_word == WORD_TO_GUESS:
                # print(f'WIN IN {n_round + 1} GUESSES!\n\n\n')
                stats[WORD_TO_GUESS] = n_round + 1
                break

            # Filter our list of remaining possible words
            words = pattern_dict[guess_word][info]
            all_words = all_words.intersection(words)

        if WORD_TO_GUESS not in stats:
            stats[WORD_TO_GUESS] = -1

    return stats

if __name__ == "__main__":
    start = time.time()
    stats = main()
    end = time.time()
    print(f'Words not guessed: {sum(x < 0 for x in stats.values())}')
    print(f'Average number of guesses: {np.mean(list(stats.values()))}')
    print(f'Maximum number of guesses: {np.max(list(stats.values()))}')
    print(f'Time needed: {end - start}')

woctezuma commented 1 year ago

def get_pattern(all_dictionary, guess_word):
    chunk_no = find_chunk_no(all_dictionary, guess_word)
    pattern_dict = load_pattern_dict(chunk_no)
    return pattern_dict[guess_word]
Would this line: pattern_dict = load_pattern_dict(chunk_no) not occur more overhead than it would gain? Loading the pickle from disk for every pattern calculation seems to be very expensive. Perhaps these chunks of the pattern_dict need to be stored in memory?

Yes, I don't think my change is such a good idea: the files stored on disk are really big, and the script is really slow. The only advantage is that there is no crash due to a high usage of RAM when applying the script to Dungleon (which has more allowed guesses than Wordle).

I have made many other refactoring changes in my fork, but I don't think it would make sense to incorporate the chunk utils, unless the data is stored in a more compact way so that the script runs fast.

By the way, I have seen that there is some (very fast) supplementary code to the video mentioned in your README:

3Blue1Brown, Solving Wordle using information theory, posted on Youtube on February 6, 2022,
3b1b/videos: supplementary code (in Python) accompanying the aforementioned video,

woctezuma commented 1 year ago

For reference, this is the result of the clean-up of the supplementary code: https://github.com/woctezuma/3b1b-wordle-solver

GillesVandewiele / Wordle-Bot

Add chunk utils #12

8