Open NotSure2732 opened 2 years ago
Let's narrow down the cause of your issue.
One more suggestion here: please format your issue description with markdown syntax properly. Otherwise, your code is not human-readable.
This problem is resolved in v2.10: "Fixed the racing issue in RL strategy on submitting models."
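For anyone hitting the same `KeyError`, here is a minimal, purely illustrative sketch of how that kind of race produces it. The names `running_models` and `trial_id` mirror the traceback further down; everything else is invented for illustration and is not NNI's actual implementation.

```python
import threading
import time

# Illustrative only: a dispatcher records running trials in a dict,
# while a callback looks up incoming metrics by trial id.
running_models = {}

def metric_callback(trial_id):
    # If the metric arrives before the trial was registered,
    # this lookup raises KeyError -- the symptom in the log below.
    model = running_models[trial_id]
    print('got metric for', model)

def submit_trial(trial_id):
    # Simulate the submission path registering the trial "late".
    time.sleep(0.1)
    running_models[trial_id] = f'model-{trial_id}'

t = threading.Thread(target=submit_trial, args=(3,))
t.start()
metric_callback(3)   # runs before submit_trial finishes -> KeyError: 3
t.join()
```

Upgrading to v2.10 or later should remove the race on NNI's side; you can check which version is installed with `nni.__version__`.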
Describe the issue: I tried running the following model space with PolicyBasedRL; I will also include the experiment configuration:
BASELINE NAS USING v2.7
```python
from nni.retiarii.serializer import model_wrapper
import torch.nn.functional as F
import nni.retiarii.nn.pytorch as nn


class Block1(nn.Module):
    def __init__(self, layer_size):
        super().__init__()
        self.conv1 = nn.Conv2d(3, layer_size * 2, 3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(layer_size * 2, layer_size * 8, 3, stride=1, padding=1)


class Block2(nn.Module):
    def __init__(self, layer_size):
        super().__init__()
        self.conv1 = nn.Conv2d(3, layer_size, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(layer_size, layer_size * 2, 3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(layer_size * 2, layer_size * 8, 3, stride=1, padding=1)


class Block3(nn.Module):
    def __init__(self, layer_size):
        super().__init__()
        self.conv1 = nn.Conv2d(3, layer_size, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(layer_size, layer_size * 2, 3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(layer_size * 2, layer_size * 4, 3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(layer_size * 4, layer_size * 8, 3, stride=1, padding=1)


@model_wrapper
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        rand_var = nn.ValueChoice([32, 64])
        self.conv1 = nn.LayerChoice([Block1(rand_var), Block2(rand_var), Block3(rand_var)])
        self.conv2 = nn.Conv2d(rand_var * 8, rand_var * 16, 3, stride=1, padding=1)
        self.fc1 = nn.Linear(rand_var * 16 * 8 * 8, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)


model = Net()
```
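As pasted, none of the classes include a `forward` method, so the snippet will not run on its own. Purely as a hypothetical illustration of a forward pass consistent with Block1's declared layers (this is not the reporter's actual code), one might add inside Block1:

```python
    def forward(self, x):
        # conv1 -> pool -> conv2, matching the layers declared in __init__
        x = self.pool(F.relu(self.conv1(x)))
        x = F.relu(self.conv2(x))
        return x
```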
```python
from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment

exp = RetiariiExperiment(model, trainer, [], RL_strategy)

exp_config = RetiariiExeConfig('local')
exp_config.experiment_name = '5%_RL_10_epochs_64_batch'
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 100
exp_config.trial_gpu_number = 2
exp_config.max_experiment_duration = '660m'
exp_config.execution_engine = 'base'
exp_config.training_service.use_active_gpu = False
```
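The snippet above references `trainer` and `RL_strategy` without defining them. A plausible reconstruction for NNI 2.7, shown only as a hedged sketch (dataset, batch size, and epoch count are guesses, and the evaluator's argument names have changed between NNI releases), would be along these lines:

```python
import nni.retiarii.strategy as strategy
import nni.retiarii.evaluator.pytorch.lightning as pl
from torchvision import datasets, transforms

# Illustrative data pipeline -- the original report does not show one.
transform = transforms.Compose([transforms.ToTensor()])
train_set = datasets.CIFAR10('data', train=True, download=True, transform=transform)
test_set = datasets.CIFAR10('data', train=False, download=True, transform=transform)

# A Lightning-based classification evaluator and the RL strategy named in the report.
trainer = pl.Classification(train_dataloader=pl.DataLoader(train_set, batch_size=64),
                            val_dataloaders=pl.DataLoader(test_set, batch_size=64),
                            max_epochs=10)
RL_strategy = strategy.PolicyBasedRL()
```

The experiment would then normally be launched with `exp.run(exp_config, 8081)`, where the port is only used for the web UI.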
This led to the following error:
```
[2022-04-24 23:49:22] ERROR (nni.runtime.msg_dispatcher_base/Thread-5) 3
Traceback (most recent call last):
  File "/Users/sh/opt/anaconda3/lib/python3.7/site-packages/nni/runtime/msg_dispatcher_base.py", line 88, in command_queue_worker
    self.process_command(command, data)
  File "/Users/sh/opt/anaconda3/lib/python3.7/site-packages/nni/runtime/msg_dispatcher_base.py", line 147, in process_command
    command_handlers[command](data)
  File "/Users/sh/opt/anaconda3/lib/python3.7/site-packages/nni/retiarii/integration.py", line 170, in handle_report_metric_data
    self._process_value(data['value']))
  File "/Users/sh/opt/anaconda3/lib/python3.7/site-packages/nni/retiarii/execution/base.py", line 111, in _intermediate_metric_callback
    model = self._running_models[trial_id]
KeyError: 3
```
What does this error mean, why does it occur, and how can I fix it?
Thanks for your help!