ahmedfgad / TorchGA

Train PyTorch Models using the Genetic Algorithm with PyGAD
https://pygad.readthedocs.io

Wrong results when parallel processing is used. #5

Closed: ahmedfgad closed this issue 1 year ago

ahmedfgad commented 1 year ago

PyTorch gives wrong results when model evaluation runs in parallel threads: every call to model_error() loads weights into the single shared model, so one thread's load_state_dict() can overwrite another thread's weights before its forward pass runs. This is similar to this issue: https://github.com/ahmedfgad/GeneticAlgorithmPython/issues/145

import torch
import numpy
import concurrent.futures
import copy

numpy.random.seed(1)

def create_rand_weights(model, num_models):
    # Build num_models random state_dicts whose entries match the shapes of
    # the model's own parameters.
    random_model_weights = []
    for model_idx in range(num_models):
        weights_dict = model.state_dict()
        for key in weights_dict:

            w_matrix = weights_dict[key].cpu().detach().numpy()
            layer_weights_shape = w_matrix.shape

            if len(layer_weights_shape) > 1:
                layer_weights = numpy.random.rand(layer_weights_shape[0], layer_weights_shape[1])
            else:
                layer_weights = numpy.random.rand(layer_weights_shape[0])
            weights_dict[key] = torch.from_numpy(layer_weights)
        random_model_weights.append(weights_dict)

    return random_model_weights

def model_error(model_weights):
    global data_inputs, data_outputs, model

    # Correct approach (commented out): evaluate a private copy of the model.
    # _model = copy.deepcopy(model)
    # _model.load_state_dict(model_weights)
    # predictions = _model(data_inputs)

    # Buggy approach: every thread loads its weights into the same shared
    # model, so another thread can overwrite them before the forward pass.
    model.load_state_dict(model_weights)

    predictions = model(data_inputs)

    abs_error = loss_function(predictions, data_outputs).detach().numpy() + 0.00000001

    return abs_error

input_layer = torch.nn.Linear(3, 2)
relu_layer = torch.nn.ReLU()
output_layer = torch.nn.Linear(2, 1)

model = torch.nn.Sequential(input_layer,
                            relu_layer,
                            output_layer)

loss_function = torch.nn.L1Loss()

data_inputs = torch.tensor([[0.02, 0.1, 0.15],
                           [0.7, 0.6, 0.8],
                           [1.5, 1.2, 1.7],
                           [3.2, 2.9, 3.1]])    
data_outputs = torch.tensor([[0.1],
                            [0.6],
                            [1.3],
                            [2.5]])

num_models = 10
random_model_weights = create_rand_weights(model, num_models)

ExecutorClass = concurrent.futures.ThreadPoolExecutor
thread_output = []
with ExecutorClass(max_workers=2) as executor:
    output = executor.map(model_error, random_model_weights)
for out in output:
    thread_output.append(out)
thread_output = numpy.array(thread_output)
print("Wrong Outputs using Threads")
print(thread_output)

print("\n\n")

correct_output = []
for idx in range(num_models):
    error = model_error(random_model_weights[idx])
    correct_output.append(error)
correct_output = numpy.array(correct_output)
print("Correct Outputs without Threads")
print(correct_output)

# The differences are non-zero, which shows the threaded results are wrong.
print(correct_output - thread_output)
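
For context, here is a minimal sketch (not from the original report) of the same kind of race with a plain shared dict: each worker writes into shared state and then reads it back, so another thread can overwrite the value in between, just like load_state_dict() followed by a forward pass on the shared model.

import concurrent.futures
import time

shared = {"value": None}

def set_and_read(value):
    # Corresponds to model.load_state_dict(...): write into shared state.
    shared["value"] = value
    time.sleep(0.001)  # give another thread a chance to overwrite the value
    # Corresponds to model(data_inputs): read the shared state back.
    return shared["value"]

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    results = list(executor.map(set_and_read, range(10)))

# With threads, results may differ from list(range(10)) because another
# thread can overwrite shared["value"] between the write and the read.
print(results)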
ahmedfgad commented 1 year ago

The issue is solved by deep-copying the model inside model_error() before loading the weights and making predictions: https://stackoverflow.com/a/75606666/5426539

import torch
import numpy
import concurrent.futures
import copy

numpy.random.seed(1)

def create_rand_weights(model, num_models):
    random_model_weights = []
    for model_idx in range(num_models):
        weights_dict = model.state_dict()
        for key in weights_dict:

            w_matrix = weights_dict[key].cpu().detach().numpy()
            layer_weights_shape = w_matrix.shape

            if len(layer_weights_shape) > 1:
                layer_weights = numpy.random.rand(layer_weights_shape[0], layer_weights_shape[1])
            else:
                layer_weights = numpy.random.rand(layer_weights_shape[0])
            weights_dict[key] = torch.from_numpy(layer_weights)
        random_model_weights.append(weights_dict)

    return random_model_weights

def model_error(model_weights):
    global data_inputs, data_outputs, model
    # Evaluate a private deep copy of the model so concurrent calls cannot
    # overwrite each other's weights.
    _model = copy.deepcopy(model)
    _model.load_state_dict(model_weights)

    predictions = _model(data_inputs)

    abs_error = loss_function(predictions, data_outputs).detach().numpy() + 0.00000001

    return abs_error

input_layer = torch.nn.Linear(3, 2)
relu_layer = torch.nn.ReLU()
output_layer = torch.nn.Linear(2, 1)

model = torch.nn.Sequential(input_layer,
                            relu_layer,
                            output_layer)

loss_function = torch.nn.L1Loss()

data_inputs = torch.tensor([[0.02, 0.1, 0.15],
                           [0.7, 0.6, 0.8],
                           [1.5, 1.2, 1.7],
                           [3.2, 2.9, 3.1]])    
data_outputs = torch.tensor([[0.1],
                            [0.6],
                            [1.3],
                            [2.5]])

num_models = 10
random_model_weights = create_rand_weights(model, num_models)

ExecutorClass = concurrent.futures.ThreadPoolExecutor
thread_output = []
with ExecutorClass(max_workers=2) as executor:
    output = executor.map(model_error, random_model_weights)
for out in output:
    thread_output.append(out)
thread_output = numpy.array(thread_output)
print("Wrong Outputs using Threads")
print(thread_output)

print("\n\n")

correct_output = []
for idx in range(num_models):
    error = model_error(random_model_weights[idx])
    correct_output.append(error)
correct_output = numpy.array(correct_output)
print("Correct Outputs without Threads")
print(correct_output)

# The differences should now be all zeros.
print(correct_output - thread_output)
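
As a side note, an alternative sketch (not tested here) would keep the single shared model but serialize access with a threading.Lock. Deep-copying keeps the evaluations truly parallel, while a lock effectively runs them one at a time, so the copy is the better fit for the genetic algorithm's parallel fitness evaluation.

import threading

model_lock = threading.Lock()

def model_error_locked(model_weights):
    # Assumes the same global model, data_inputs, data_outputs and
    # loss_function defined in the script above. The lock guarantees that no
    # other thread replaces the weights between load_state_dict() and the
    # forward pass.
    with model_lock:
        model.load_state_dict(model_weights)
        predictions = model(data_inputs)
        abs_error = loss_function(predictions, data_outputs).detach().numpy() + 0.00000001
    return abs_error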