google-research / scenic

Scenic: A Jax Library for Computer Vision Research and Beyond

AttributeError: 'list' object has no attribute 'items' #627

Open · parhameftekhar opened this issue 1 year ago

parhameftekhar commented 1 year ago

I'm trying to train the MBT model on my own dataset. I get the following error. Any help is appreciated.

Traceback (most recent call last):
  File "/home/eftekhar/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/eftekhar/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/eftekhar/models/scenic/scenic/projects/mbt/main.py", line 49, in <module>
    app.run(main=main)
  File "/home/eftekhar/anaconda3/lib/python3.9/site-packages/scenic/app.py", line 65, in run
    app.run(functools.partial(_run_main, main=main))
  File "/home/eftekhar/anaconda3/lib/python3.9/site-packages/absl/app.py", line 308, in run
    _run_main(main, args)
  File "/home/eftekhar/anaconda3/lib/python3.9/site-packages/absl/app.py", line 254, in _run_main
    sys.exit(main(argv))
  File "/home/eftekhar/anaconda3/lib/python3.9/site-packages/scenic/app.py", line 100, in _run_main
    main(rng=rng, config=FLAGS.config, workdir=FLAGS.workdir, writer=writer)
  File "/home/eftekhar/models/scenic/scenic/projects/mbt/main.py", line 39, in main
    trainer.train(
  File "/home/eftekhar/anaconda3/lib/python3.9/site-packages/scenic/projects/mbt/trainer.py", line 425, in train
    gflops) = mbt_train_utils.initialize_model(
  File "/home/eftekhar/anaconda3/lib/python3.9/site-packages/scenic/projects/mbt/train_utils.py", line 83, in initialize_model
    for modality_name, spec in input_spec.items():
AttributeError: 'list' object has no attribute 'items'
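For reference, the failing line is "for modality_name, spec in input_spec.items():", which only works when input_spec is a mapping. A minimal standalone sketch of the mismatch (the modality shapes below are made-up placeholders, not pulled from the scenic source):

# Sketch of the failure mode: .items() exists on a dict but not on a list.
input_spec_as_dict = {
    'rgb': ((1, 32, 224, 224, 3), 'float32'),
    'spectrogram': ((1, 8, 100, 128, 1), 'float32'),
}
input_spec_as_list = list(input_spec_as_dict.values())

for modality_name, spec in input_spec_as_dict.items():  # iterates fine
  print(modality_name, spec)

try:
  input_spec_as_list.items()  # the same call the trainer makes, on a list
except AttributeError as err:
  print(err)  # 'list' object has no attribute 'items'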

Here is the config file I'm using:

r"""Multimodal sound classification on the balanced (mini) AudioSet.

"""

import ml_collections

AUDIOSET_TRAIN_SIZE = 20361

def get_config(): """Returns the base experiment configuration.""" config = ml_collections.ConfigDict() config.experiment_name = 'mbt_balanced_audioset_classification'

config.dataset_configs = ml_collections.ConfigDict() config.dataset_configs.base_dir = '/home/eth/tfrecords_builder/tmp/generated_dataset' config.dataset_configs.tables = { 'train': 'train', 'validation': 'valid', 'test': 'test', } config.dataset_configs.examples_per_subset = { 'train': 4189, 'validation': 898, 'test': 898 } config.dataset_configs.num_classes = 20 config.data_dtype_str = 'float32' config.dataset_name = 'video_tfrecord_dataset' config.dataset_configs.modalities = ('spectrogram', 'rgb') config.dataset_configs.return_as_dict = False config.dataset_configs.num_frames = 32 config.dataset_configs.stride = 2 config.dataset_configs.num_spec_frames = 8 config.dataset_configs.spec_stride = 1

config.dataset_configs.spec_mean = 1.102 config.dataset_configs.spec_stddev = 2.762

config.dataset_configs.min_resize = 256 config.dataset_configs.crop_size = 224 config.dataset_configs.spec_shape = (100, 128)

config.dataset_configs.one_hot_labels = True config.dataset_configs.zero_centering = True

config.dataset_configs.do_multicrop_test = True config.dataset_configs.log_test_epochs = 4 config.dataset_configs.num_test_clips = 4 config.dataset_configs.test_batch_size = 8 # Needs to be num_local_devices config.multicrop_clips_per_device = 2

config.dataset_configs.augmentation_params = ml_collections.ConfigDict() config.dataset_configs.augmentation_params.do_jitter_scale = True config.dataset_configs.augmentation_params.scale_min_factor = 0.9 config.dataset_configs.augmentation_params.scale_max_factor = 1.33 config.dataset_configs.augmentation_params.prob_scale_jitter = 1.0 config.dataset_configs.augmentation_params.do_color_augment = True config.dataset_configs.augmentation_params.prob_color_augment = 0.8 config.dataset_configs.augmentation_params.prob_color_drop = 0.1

config.dataset_configs.prefetch_to_device = 2

config.dataset_configs.spec_augment = True config.dataset_configs.spec_augment_params = ml_collections.ConfigDict() config.dataset_configs.spec_augment_params.freq_mask_max_bins = 48 config.dataset_configs.spec_augment_params.freq_mask_count = 1 config.dataset_configs.spec_augment_params.time_mask_max_frames = 48 config.dataset_configs.spec_augment_params.time_mask_count = 4 config.dataset_configs.spec_augment_params.time_warp_max_frames = 1.0 config.dataset_configs.spec_augment_params.time_warp_max_ratio = 0 config.dataset_configs.spec_augment_params.time_mask_max_ratio = 0

config.model_name = 'mbt_multilabel_classification' config.model = ml_collections.ConfigDict() config.model.modality_fusion = ('spectrogram', 'rgb') config.model.use_bottleneck = True config.model.test_with_bottlenecks = True config.model.share_encoder = False config.model.n_bottlenecks = 4 config.model.fusion_layer = 8 config.model.hidden_size = 768 config.model.patches = ml_collections.ConfigDict() config.model.attention_config = ml_collections.ConfigDict() config.model.attention_config.type = 'spacetime' config.model.num_heads = 12 config.model.mlp_dim = 3072 config.model.num_layers = 12 config.model.representation_size = None config.model.classifier = 'gap' config.model.attention_dropout_rate = 0. config.model.dropout_rate = 0. config.model_dtype_str = 'float32'

config.model.temporal_encoding_config = ml_collections.ConfigDict() config.model.temporal_encoding_config.method = '3d_conv' config.model.patches.size = [16, 16, 2] config.model.temporal_encoding_config.kernel_init_method = 'central_frame_initializer' config.model.temporal_encoding_config.n_sampled_frames = 4 # Unused here.

config.trainer_name = 'mbt_trainer' config.optimizer = 'momentum' config.optimizer_configs = ml_collections.ConfigDict() config.l2_decay_factor = 0 config.max_grad_norm = 1 config.label_smoothing = 0.3 config.num_training_epochs = 50 config.batch_size = 64 config.rng_seed = 0 config.mixup = ml_collections.ConfigDict() config.mixup.alpha = 0.5 config.mixmod = False config.model.stochastic_droplayer_rate = 0.3

config.init_from = ml_collections.ConfigDict() config.init_from.model_config = None disable=line-too-long config.init_from.checkpoint_path = '/home/eth/models/scenic/scenic/projects/mbt/vit_base' config.init_from.checkpoint_format = 'scenic' config.init_from.model_config = ml_collections.ConfigDict() config.init_from.model_config.model = ml_collections.ConfigDict() config.init_from.model_config.model.classifier = 'token' # Specify if this is 'token' or 'gap'. pylint: disable=line-too-long config.init_from.restore_positional_embedding = True config.init_from.restore_input_embedding = True config.init_from.positional_embed_size_change = 'resize_tile'

steps_per_epoch = AUDIOSET_TRAIN_SIZE // config.batch_size total_steps = config.num_training_epochs steps_per_epoch config.lr_configs = ml_collections.ConfigDict() config.lr_configs.learning_rate_schedule = 'compound' config.lr_configs.factors = 'constant cosine_decay linear_warmup' config.lr_configs.warmup_steps = 2.5 steps_per_epoch config.lr_configs.steps_per_cycle = total_steps config.lr_configs.base_learning_rate = 5e-1

config.write_summary = True config.checkpoint = True # Do checkpointing. config.debug_train = False # Debug mode during training. config.debug_eval = False # Debug mode during eval. config.checkpoint_steps = 500 # Checkpoint more frequently than a val epoch. return config
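Judging by the traceback, mbt_train_utils.initialize_model iterates input_spec.items(), so it expects an input spec keyed by modality name. The config above sets return_as_dict = False, which (an assumption based on the flag name, not verified against this exact revision of scenic) would make the dataset hand back a flat list of specs instead, matching the error. A hedged one-line change to try:

# Assumption: MBT's multimodal path wants per-modality dicts, e.g.
# {'rgb': ..., 'spectrogram': ...}, for both batches and input specs.
config.dataset_configs.return_as_dict = True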

huangfei00 commented 10 months ago

I encountered the same problem as you did. Did you manage to solve it?

huangfei00 commented 10 months ago

Hmm... Also, could you provide some reference code for preprocessing the AudioSet dataset?
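In case it helps anyone landing here: video_tfrecord_dataset reads DMVR-style tf.train.SequenceExample records, so one starting point is a small writer script. Below is a minimal, unverified sketch; every feature key in it ('image/encoded', 'melspec/feature/floats', 'clip/label/index') is an assumption to check against scenic/projects/mbt and the DMVR documentation before use.

import tensorflow as tf

def make_sequence_example(jpeg_frames, spectrogram, label_indices):
  """Builds a DMVR-style SequenceExample; all keys are unverified assumptions."""
  context = tf.train.Features(feature={
      'clip/label/index': tf.train.Feature(
          int64_list=tf.train.Int64List(value=label_indices)),
  })
  frames = tf.train.FeatureList(feature=[
      tf.train.Feature(bytes_list=tf.train.BytesList(value=[f]))
      for f in jpeg_frames  # one JPEG-encoded bytes object per frame
  ])
  spec_rows = tf.train.FeatureList(feature=[
      tf.train.Feature(float_list=tf.train.FloatList(value=row))
      for row in spectrogram  # one list of mel bins per spectrogram frame
  ])
  return tf.train.SequenceExample(
      context=context,
      feature_lists=tf.train.FeatureLists(feature_list={
          'image/encoded': frames,
          'melspec/feature/floats': spec_rows,
      }))

# Toy record: two black 224x224 frames, an 8x128 zero spectrogram, label 0.
jpeg = tf.io.encode_jpeg(tf.zeros([224, 224, 3], tf.uint8)).numpy()
with tf.io.TFRecordWriter('train.tfrecord') as writer:
  example = make_sequence_example([jpeg, jpeg], [[0.0] * 128] * 8, [0])
  writer.write(example.SerializeToString())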

parhameftekhar commented 10 months ago

@huangfei00 No, I gave up.