Closed: YTW0518 closed this issue 3 years ago.

Hello, when I modify the number of cells to 20 in "darts_imagenet.json", as in "darts_cifar10.json", a feature-mismatch problem appears at the beginning of the fully-training stage:

2021-05-21 23:37:36.51 INFO ------------------------------------------------
2021-05-21 23:37:36.59 INFO ------------------------------------------------
2021-05-21 23:37:36.59 INFO Step: fully_train
2021-05-21 23:37:36.59 INFO ------------------------------------------------
2021-05-21 23:37:36.62 INFO init TrainPipeStep...
2021-05-21 23:37:36.62 INFO TrainPipeStep started...
2021-05-21 23:37:52.708 INFO Model was created.
2021-05-21 23:37:57.248 ERROR Failed to run pipeline.
2021-05-21 23:37:57.253 ERROR Traceback (most recent call last):
  File "/home/wyt/.local/lib/python3.7/site-packages/vega/core/pipeline/pipeline.py", line 69, in run
    PipeStep().do()
  File "/home/wyt/.local/lib/python3.7/site-packages/vega/core/pipeline/train_pipe_step.py", line 50, in do
    self._train_multi_models(records)
  File "/home/wyt/.local/lib/python3.7/site-packages/vega/core/pipeline/train_pipe_step.py", line 121, in _train_multi_models
    self._train_single_model(record.desc, record.worker_id, weights_file)
  File "/home/wyt/.local/lib/python3.7/site-packages/vega/core/pipeline/train_pipe_step.py", line 95, in _train_single_model
    self._do_single_fully_train(trainer)
  File "/home/wyt/.local/lib/python3.7/site-packages/vega/core/pipeline/train_pipe_step.py", line 114, in _do_single_fully_train
    self._train_single_gpu_model(trainer)
  File "/home/wyt/.local/lib/python3.7/site-packages/vega/core/pipeline/train_pipe_step.py", line 99, in _train_single_gpu_model
    self.master.run(trainer, evaluator)
  File "/home/wyt/.local/lib/python3.7/site-packages/vega/core/scheduler/local_master.py", line 47, in run
    worker.train_process()
  File "/home/wyt/.local/lib/python3.7/site-packages/zeus/trainer/trainer_base.py", line 133, in train_process
    self._train_loop()
  File "/home/wyt/.local/lib/python3.7/site-packages/zeus/trainer/trainer_base.py", line 308, in _train_loop
    self._train_epoch()
  File "/home/wyt/.local/lib/python3.7/site-packages/zeus/trainer/trainer_torch.py", line 102, in _train_epoch
    train_batch_output = self.train_step(batch)
  File "/home/wyt/.local/lib/python3.7/site-packages/zeus/trainer/trainer_torch.py", line 155, in _default_train_step
    output = self.model(input)
  File "/home/wyt/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/wyt/.local/lib/python3.7/site-packages/zeus/modules/operators/functions/pytorch_fn.py", line 128, in forward
    return self.call(inputs, *args, **kwargs)
  File "/home/wyt/.local/lib/python3.7/site-packages/zeus/networks/super_network.py", line 95, in call
    logits_aux = self.auxiliary_head(s1)
  File "/home/wyt/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/wyt/.local/lib/python3.7/site-packages/zeus/modules/operators/functions/pytorch_fn.py", line 128, in forward
    return self.call(inputs, *args, **kwargs)
  File "/home/wyt/.local/lib/python3.7/site-packages/zeus/modules/operators/functions/pytorch_fn.py", line 106, in call
    output = model(output)
  File "/home/wyt/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/wyt/.local/lib/python3.7/site-packages/zeus/modules/operators/functions/pytorch_fn.py", line 395, in forward
    out = super().forward(x)
  File "/home/wyt/.local/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 87, in forward
    return F.linear(input, self.weight, self.bias)
  File "/home/wyt/.local/lib/python3.7/site-packages/torch/nn/functional.py", line 1370, in linear
    ret = torch.addmm(bias, input, weight.t())
RuntimeError: size mismatch, m1: [64 x 12288], m2: [768 x 21] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:290

From this point of view, 768 seems to be fixed in the current framework, but when we make changes like this, where is it set? Looking forward to your reply, thank you very much!

@YTW0518
The image size is fixed at 224, so you need to resize the images to 224 in the dataset configuration, for example:
fully_train:
  dataset:
    type: CutsomClassificationDataset
    common:
      train_portion: 1.0
    train:
      batch_size: 96
      shuffle: True
      transforms:
        - type: Resize
          size: [256, 256]
        - type: RandomCrop
          size: [224, 224]
        - type: RandomHorizontalFlip
        - type: ToTensor
        - type: Normalize
          mean:
            - 0.49139968
            - 0.48215827
            - 0.44653124
          std:
            - 0.24703233
            - 0.24348505
            - 0.26158768
    val:
      batch_size: 96
      shuffle: False
      transforms:
        - type: Resize
          size: [224, 224]
        - type: ToTensor
        - type: Normalize
          mean:
            - 0.49139968
            - 0.48215827
            - 0.44653124
          std:
            - 0.24703233
            - 0.24348505
            - 0.26158768
    test:
      batch_size: 96
      shuffle: False
      transforms:
        - type: Resize
          size: [224, 224]
        - type: ToTensor
        - type: Normalize
          mean:
            - 0.49139968
            - 0.48215827
            - 0.44653124
          std:
            - 0.24703233
            - 0.24348505
            - 0.26158768
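For readers who want to check what this pipeline does outside of Vega, here is a minimal plain-torchvision sketch of the same train and val transforms. The assumption (not confirmed against Vega's source here) is that each `type:` entry maps onto the torchvision transform of the same name:

import torchvision.transforms as T

# Hypothetical torchvision equivalent of the `train` transforms above.
train_tf = T.Compose([
    T.Resize((256, 256)),        # enlarge first so RandomCrop has margin
    T.RandomCrop((224, 224)),    # the network input is fixed at 224
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=[0.49139968, 0.48215827, 0.44653124],
                std=[0.24703233, 0.24348505, 0.26158768]),
])

# val/test use a deterministic resize straight to 224.
val_tf = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.49139968, 0.48215827, 0.44653124],
                std=[0.24703233, 0.24348505, 0.26158768]),
])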
Thanks for your reply. However, the image size had already been fixed to 224 before, as the "cars.yml" below shows. What I changed is the number of cells in the template "darts_imagenet.json", from 14 to 20, and that is when the feature mismatch appears; there is no problem when the number of cells is 14.
File "/home/wyt/.local/lib/python3.7/site-packages/torch/nn/functional.py", line 1370, in linear ret = torch.addmm(bias, input, weight.t()) RuntimeError: size mismatch, m1: [64 x 12288], m2: [768 x 21] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:290
1. [cars.yml]

general:
  backend: pytorch  # pytorch

pipeline: [nas, fully_train]

nas:
  pipe_step:
    type: SearchPipeStep
  dataset:
    type: ClassificationDataset
    common:
      data_path: /home/wyt/dataset/out2
      train_portion: 0.5
      num_workers: 8
      drop_last: False
    train:
      shuffle: True
      batch_size: 6
      transforms:
        - type: Resize
          size: [256, 256]
        - type: RandomCrop
          size: [224, 224]
        - type: RandomHorizontalFlip
        - type: ToTensor
        - type: Normalize
          mean:
            - 0.4842
            - 0.4901
            - 0.4505
          std:
            - 0.1734
            - 0.1635
            - 0.1554
    val:
      batch_size: 64
      shuffle: False
      transforms:
        - type: Resize
          size: [224, 224]
        - type: ToTensor
        - type: Normalize
          mean:
            - 0.4842
            - 0.4901
            - 0.4505
          std:
            - 0.1734
            - 0.1635
            - 0.1554
    test:
      batch_size: 64
      shuffle: False
      transforms:
        - type: Resize
          size: [224, 224]
        - type: ToTensor
        - type: Normalize
          mean:
            - 0.4842
            - 0.4901
            - 0.4505
          std:
            - 0.1734
            - 0.1635
            - 0.1554
  search_algorithm:
    type: CARSAlgorithm
    policy:
      num_individual: 12
      start_ga_epoch: 50
      ga_interval: 20
      select_method: uniform
      warmup: 50
  search_space:
    type: SearchSpace
    modules: ['super_network']
    super_network:
      type: CARSDartsNetwork
      stem:
        type: PreOneStem
        init_channels: 8
        stem_multi: 3
      head:
        type: LinearClassificationHead
      init_channels: 8
      num_classes: 21
      auxiliary: False
      search: True
      cells:
        modules: [
          'normal', 'normal', 'reduce',
          'normal', 'normal', 'reduce',
          'normal', 'normal'
        ]
        normal:
          type: NormalCell
          steps: 4
          genotype: [
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 2, 0 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 2, 1 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 3, 0 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 3, 1 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 3, 2 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 4, 0 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 4, 1 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 4, 2 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 4, 3 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 0 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 1 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 2 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 3 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 4 ]
          ]
          concat: [2, 3, 4, 5]
        reduce:
          type: ReduceCell
          steps: 4
          genotype: [
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 2, 0 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 2, 1 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 3, 0 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 3, 1 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 3, 2 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 4, 0 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 4, 1 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 4, 2 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 4, 3 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 0 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 1 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 2 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 3 ],
            [ ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'], 5, 4 ]
          ]
          concat: [2, 3, 4, 5]
  trainer:
    type: Trainer
    darts_template_file: "{default_darts_imagenet_template}"
    callbacks: CARSTrainerCallback
    epochs: 500
    optimizer:
      type: SGD
      params:
        lr: 0.025
        momentum: 0.9
        weight_decay: !!float 3e-4
    lr_scheduler:
      type: CosineAnnealingLR
      params:
        T_max: 500
        eta_min: 0.001
    grad_clip: 5.0
    seed: 10
    unrolled: True
    loss:
      type: CrossEntropyLoss
      params:
        sparse: True
fully_train:
  pipe_step:
    type: TrainPipeStep
    models_folder: "/home/wyt/tasks/0521.224449.324/output/nas/"
  trainer:
    ref: nas.trainer
    epochs: 600
    lr_scheduler:
      type: CosineAnnealingLR
      params:
        T_max: 600.0
        eta_min: 0
    loss:
      type: MixAuxiliaryLoss
      params:
        loss_base:
          type: CrossEntropyLoss
        aux_weight: 0.4
    seed: 100
    drop_path_prob: 0.2
  evaluator:
    type: Evaluator
    host_evaluator:
      type: HostEvaluator
      metric:
        type: accuracy
  dataset:
    ref: nas.dataset
    common:
      train_portion: 1
    train:
      batch_size: 256
      shuffle: True
      transforms:
        - type: Resize
          size: [256, 256]
        - type: RandomCrop
          size: [224, 224]
        - type: RandomHorizontalFlip
        - type: ToTensor
        - type: Normalize
          mean:
            - 0.4842
            - 0.4901
            - 0.4505
          std:
            - 0.1734
            - 0.1635
            - 0.1554
    val:
      batch_size: 256
      shuffle: False
      transforms:
        - type: Resize
          size: [224, 224]
        - type: ToTensor
        - type: Normalize
          mean:
            - 0.4842
            - 0.4901
            - 0.4505
          std:
            - 0.1734
            - 0.1635
            - 0.1554
    test:
      batch_size: 256
      shuffle: False
      transforms:
        - type: Resize
          size: [224, 224]
        - type: ToTensor
        - type: Normalize
          mean:
            - 0.4842
            - 0.4901
            - 0.4505
          std:
            - 0.1734
            - 0.1635
            - 0.1554
2. [darts_imagenet.json]
{ "modules": [ "super_network" ], "super_network": { "type": "DartsNetwork", "input_size": 224, "init_channels": 48, "num_classes": 21, "auxiliary": true, "aux_size": 8, "auxiliary_layer": 13, "drop_path_prob": 0.2, "search": false, "stem": { "type": "PreTwoStem", "init_channels": 48 }, "head": { "type": "LinearClassificationHead" }, "cells": { "modules": [ "normal", "normal", "normal", "normal", "normal", "normal", "reduce", "normal", "normal", "normal", "normal", "normal", "normal", "reduce", "normal", "normal", "normal", "normal", "normal", "normal" ], "normal": { "type": "NormalCell", "steps": 4, "genotype": [ [ "skip_connect", 2, 0 ], [ "skip_connect", 2, 1 ], [ "sep_conv_3x3", 3, 0 ], [ "sep_conv_3x3", 3, 1 ], [ "sep_conv_3x3", 4, 1 ], [ "sep_conv_3x3", 4, 0 ], [ "sep_conv_3x3", 5, 0 ], [ "sep_conv_3x3", 5, 1 ] ], "concat": [ 2, 3, 4, 5 ] }, "reduce": { "type": "ReduceCell", "steps": 4, "genotype": [ [ "sep_conv_3x3", 2, 0 ], [ "sep_conv_3x3", 2, 1 ], [ "sep_conv_3x3", 3, 0 ], [ "sep_conv_3x3", 3, 1 ], [ "sep_conv_3x3", 4, 0 ], [ "sep_conv_3x3", 4, 1 ], [ "sep_conv_3x3", 5, 0 ], [ "sep_conv_3x3", 5, 1 ] ], "concat": [ 2, 3, 4, 5 ] } } } }
@YTW0518
Found a bug:
{
    "modules": [
        "super_network"
    ],
    "super_network": {
        "type": "DartsNetwork",
        "input_size": 224,
        "init_channels": 48,
        "num_classes": 21,
        "auxiliary": true,
        "aux_size": 8,            <---- here, the correct value is 7
        "auxiliary_layer": 13,
        "drop_path_prob": 0.2,
        "search": false,
Thanks for your timely and effective reply!!! It works! ^-^
Pleasure! :)