issue with resuming from checkpoint

slala2121 commented 9 months ago

How do I resume from a checkpoint in a separate run?

I've tried the following (per the example here: https://pymoo.org/misc/checkpoint.html?highlight=checkpoint), but the results do not match those obtained without a checkpoint (see code below).

import numpy as np

from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.core.problem import ElementwiseProblem
from pymoo.optimize import minimize
from pymoo.visualization.scatter import Scatter

import pymoo
from pymoo.algorithms.soo.nonconvex.ga import GA
from pymoo.operators.crossover.pntx import TwoPointCrossover
from pymoo.operators.mutation.bitflip import BitflipMutation
from pymoo.operators.sampling.rnd import BinaryRandomSampling
from pymoo.optimize import minimize

from pymoo.core.problem import Problem, ElementwiseProblem

from pymoo.util.running_metric import RunningMetric
from pymoo.termination.max_gen import MaximumGenerationTermination
from pymoo.core.evaluator import Evaluator
from pymoo.core.population import Population
from pymoo.problems.static import StaticProblem

import matplotlib.pyplot as plt
import os
import dill

import dill
from pymoo.problems import get_problem

from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.optimize import minimize
from pymoo.termination.max_gen import MaximumGenerationTermination

# Reproduction script: run NSGA2 for 5 generations in one go, then run it for
# 2 generations, pickle the result, restore it, continue up to 5 generations,
# and compare the two final solution sets.

# Directory holding the pickled results (specific to the author's machine).
checkpoint_dir = "/scratch/network/slala/exp/mvts/rct/dummy"

# --- Reference: a single uninterrupted 5-generation run ----------------------
problem = get_problem("zdt1", n_var=5)

algorithm = NSGA2(pop_size=100)

res_5 = minimize(problem,
                 algorithm,
                 ('n_gen', 5),
                 seed=1,
                 copy_algorithm=True,
                 verbose=True,
                 save_history=True)

with open(f"{checkpoint_dir}/res_5", "wb") as f:
    dill.dump(res_5, f)

# Keep the reference result; the name `res_5` is rebound by the resumed run.
res_5_orig = res_5

# --- Checkpoint: stop after 2 generations and pickle the result --------------
problem = get_problem("zdt1", n_var=5)

algorithm = NSGA2(pop_size=100)

res_2 = minimize(problem,
                 algorithm,
                 ('n_gen', 2),
                 seed=1,
                 copy_algorithm=True,
                 verbose=True,
                 save_history=True)

with open(f"{checkpoint_dir}/res_2", "wb") as f:
    dill.dump(res_2, f)

# --- Resume: restore the pickled result and continue to generation 5 ---------
with open(f"{checkpoint_dir}/res_2", "rb") as f:
    res_2 = dill.load(f)

# The restored algorithm was set up with the 2-generation limit, which it has
# already reached. The pymoo checkpoint tutorial replaces the termination on
# the restored object in exactly this situation, so do the same here.
# NOTE(review): confirm whether the ('n_gen', 5) / seed arguments passed to
# minimize() below are honoured for an algorithm that is already set up.
checkpoint = res_2.algorithm
checkpoint.termination = MaximumGenerationTermination(5)

res_5 = minimize(problem,
                 checkpoint,
                 ('n_gen', 5),
                 seed=1,
                 copy_algorithm=True,
                 verbose=True,
                 save_history=True)

# NOTE(review): per the maintainer's reply in this thread, the random seed
# (not the algorithm object) is what makes a resumed run diverge, so this
# comparison may still fail when the resume happens in a separate run.
assert np.allclose(res_5.X, res_5_orig.X)

blankjul commented 9 months ago

If you want the results to match, including the random seed, you have to follow the tutorial given just below the one you referred to: https://pymoo.org/misc/checkpoint.html?highlight=checkpoint#Object-Oriented

Please let me know if this makes it work.

slala2121 commented 9 months ago

Thanks for your quick reply. I followed the object-oriented example and am still seeing issues.

Here is what I tried:

import dill
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.problems import get_problem

# Reproduction script (object-oriented interface): step the algorithm
# manually, pickle it after every step, then resume from an intermediate
# checkpoint and compare against the final checkpoint of the original run.

# The final comparison uses np.allclose; this snippet did not import numpy.
import numpy as np

problem = get_problem("zdt1", n_var=5)

algorithm = NSGA2(pop_size=100)

algorithm.setup(problem, seed=1, termination=('n_gen', 5), verbose=True)

# "checkpoint_<k>" is written after k completed calls to next(), so the files
# are checkpoint_1 ... checkpoint_5 and the checkpoint_2 / checkpoint_5 files
# opened below are actually produced by this loop.
for k in range(1, 6):
    algorithm.next()
    print(algorithm.n_gen)

    with open("checkpoint_%d" % k, "wb") as f:
        dill.dump(algorithm, f)

# test resuming from the 2nd generation
with open("checkpoint_2", 'rb') as f:
    checkpoint = dill.load(f)
    print("Loaded Checkpoint:", checkpoint)

# Continue the restored algorithm until its own termination criterion is met.
while checkpoint.has_next():
    checkpoint.next()
    print(checkpoint.n_gen)

# compare against ground truth
with open("checkpoint_5", 'rb') as f:
    checkpoint_final = dill.load(f)
    print("Loaded Checkpoint:", checkpoint_final)

# NOTE(review): per the maintainer's reply in this thread, this comparison is
# expected to fail because the random seed/state differs between the original
# and the resumed run, not because of the pickled algorithm object.
assert np.allclose(checkpoint_final.result().X, checkpoint.result().X)

The assertion fails -- see attached output for specifics.

out.txt err.txt

blankjul commented 9 months ago

I was looking into this issue; please forget what I said in my last answer. The issue is not the algorithm object itself, but indeed the random seed. In fact, this is related to #469, where we discussed whether an algorithm should own a random generator (then the results would match).

I have coded up an example to show that it is indeed the random seed. You can fix the seed at the beginning of each iteration, which then produces identical runs (the code is not pretty, I admit).

import dill
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.problems import get_problem
import numpy as np

# Workaround script: reseed NumPy's global generator at the start of every
# iteration, using the generation counter as the seed, both in the original
# run and in the run resumed from a checkpoint.

problem = get_problem("zdt1", n_var=5)
algorithm = NSGA2(pop_size=100)
algorithm.setup(problem, seed=1, termination=('n_gen', 5), verbose=True)

# Original run: seed, advance one step, then pickle the algorithm under the
# generation counter that was read *before* the step.
while algorithm.has_next():
    # n_gen is still None before the first step; treat that as generation 0.
    gen = 0 if algorithm.n_gen is None else algorithm.n_gen
    np.random.seed(gen)

    algorithm.next()
    with open("checkpoint_%d" % gen, "wb") as ckpt_file:
        dill.dump(algorithm, ckpt_file)

# Resumed run: restore an intermediate checkpoint and continue with the same
# per-iteration seeding scheme.
with open("checkpoint_2", 'rb') as ckpt_file:
    checkpoint = dill.load(ckpt_file)
    print("Loaded Checkpoint:", checkpoint)

    while checkpoint.has_next():
        gen = int(checkpoint.n_gen)
        np.random.seed(gen)

        checkpoint.next()

slala2121 commented 8 months ago

Thanks. I'll follow up in case there are further issues.

anyoptimization / pymoo

issue with resuming from checkpoint #558