alibaba-mmai-research / TAdaConv

[ICLR 2022] TAda! Temporally-Adaptive Convolutions for Video Understanding. This codebase provides solutions for video classification, video representation learning and temporal detection.
https://tadaconv-iclr2022.github.io
Apache License 2.0
226 stars 31 forks source link

In MoSI, training on HMDB51 based on the pre-trained checkpoint you provided cannot reproduce the results #21

Open TJQdoIt9527 opened 12 months ago

TJQdoIt9527 commented 12 months ago

The backbone is r2p1d, and the results on the test list you provided after training are as follows (decord=0.4.0, with TEST_SCALE=112 added, using the pre-trained checkpoint you provided):

[09/08 15:44:59][INFO] tadaconv.utils.checkpoint: 492: Load from the last checkpoint file: output/r2p1d_mosi_ft_hmdb_autor/checkpoints/checkpoint_epoch_00300.pyth [09/08 15:44:59][INFO] tadaconv.datasets.base.hmdb51: 37: Reading video list from file: hmdb51_test_list.txt [09/08 15:44:59][INFO] tadaconv.datasets.base.base_dataset: 172: Loading HMDB51 dataset list for split 'test'... [09/08 15:44:59][INFO] tadaconv.datasets.base.base_dataset: 197: Dataset HMDB51 split test loaded. Length 15300. [09/08 15:44:59][INFO] test: 215: Testing model for 55 iterations [09/08 15:45:55][INFO] tadaconv.utils.logging: 89: {"cur_iter": "5", "eta": "0:00:27", "split": "test_iter", "time_diff": 0.534744} [09/08 15:45:57][INFO] tadaconv.utils.logging: 89: {"cur_iter": "10", "eta": "0:00:22", "split": "test_iter", "time_diff": 0.498246} [09/08 15:46:23][INFO] tadaconv.utils.logging: 89: {"cur_iter": "15", "eta": "0:00:21", "split": "test_iter", "time_diff": 0.513669} [09/08 15:46:26][INFO] tadaconv.utils.logging: 89: {"cur_iter": "20", "eta": "0:00:18", "split": "test_iter", "time_diff": 0.519889} [09/08 15:46:45][INFO] tadaconv.utils.logging: 89: {"cur_iter": "25", "eta": "0:08:40", "split": "test_iter", "time_diff": 16.778454} [09/08 15:46:52][INFO] tadaconv.utils.logging: 89: {"cur_iter": "30", "eta": "0:00:12", "split": "test_iter", "time_diff": 0.497483} [09/08 15:46:54][INFO] tadaconv.utils.logging: 89: {"cur_iter": "35", "eta": "0:00:11", "split": "test_iter", "time_diff": 0.560603} [09/08 15:47:17][INFO] tadaconv.utils.logging: 89: {"cur_iter": "40", "eta": "0:00:08", "split": "test_iter", "time_diff": 0.508289} [09/08 15:47:20][INFO] tadaconv.utils.logging: 89: {"cur_iter": "45", "eta": "0:00:05", "split": "test_iter", "time_diff": 0.509557} [09/08 15:47:40][INFO] tadaconv.utils.logging: 89: {"cur_iter": "50", "eta": "0:00:08", "split": "test_iter", "time_diff": 1.383271} [09/08 15:47:43][INFO] tadaconv.utils.logging: 89: {"cur_iter": "55", "eta": "0:00:00", "split": 
"test_iter", "time_diff": 0.338351} [09/08 15:47:44][INFO] tadaconv.utils.logging: 89: {"split": "test_final", "top1_acc": "40.00", "top5_acc": "69.87"}

I have found that both the pre-trained checkpoint and the fine-tuned checkpoint I trained myself are much larger than the checkpoint files you provided: my two files are 165M each, while the two files you provided are both around 50M.

Here are my training configurations:

[09/08 12:41:12][INFO] train: 336: Train with config: [09/08 12:41:12][INFO] train: 337: { "TASK_TYPE": "classification", "PRETRAIN": { "ENABLE": false }, "LOCALIZATION": { "ENABLE": false }, "TRAIN": { "ENABLE": true, "DATASET": "HMDB51", "BATCH_SIZE": 280, "LOG_FILE": "training_log.log", "EVAL_PERIOD": 5, "NUM_FOLDS": 30, "AUTO_RESUME": true, "CHECKPOINT_PERIOD": 10, "INIT": "", "CHECKPOINT_FILE_PATH": "/home/lzh/2022/tjq/TAdaConv/checkpoint/r2p1d_pt_hmdb_mosi_public.pyth", "CHECKPOINT_TYPE": "pytorch", "CHECKPOINT_INFLATE": false, "CHECKPOINT_PRE_PROCESS": { "ENABLE": false }, "FINE_TUNE": true, "ONLY_LINEAR": false, "LR_REDUCE": false, "TRAIN_VAL_COMBINE": false, "LOSS_FUNC": "cross_entropy" }, "TEST": { "ENABLE": true, "DATASET": "HMDB51", "BATCH_SIZE": 280, "NUM_SPATIAL_CROPS": 1, "SPATIAL_CROPS": "cc", "NUM_ENSEMBLE_VIEWS": 1, "LOG_FILE": "val.log", "CHECKPOINT_FILE_PATH": "", "CHECKPOINT_TYPE": "pytorch", "AUTOMATIC_MULTI_SCALE_TEST": true }, "VISUALIZATION": { "ENABLE": false, "NAME": "", "FEATURE_MAPS": { "ENABLE": false, "BASE_OUTPUT_DIR": "" } }, "SUBMISSION": { "ENABLE": false, "SAVE_RESULTS_PATH": "test.json" }, "DATA": { "DATA_ROOT_DIR": "/data/hmdb51/videos/", "ANNO_DIR": "/data1/hmdb51_annotations/hmdb51/", "NUM_INPUT_FRAMES": 16, "NUM_INPUT_CHANNELS": 3, "SAMPLING_MODE": "interval_based", "SAMPLING_RATE": 4, "TRAIN_JITTER_SCALES": [ 168, 224 ], "TRAIN_CROP_SIZE": 112, "TEST_SCALE": 112, "TEST_CROP_SIZE": 112, "MEAN": [ 0.45, 0.45, 0.45 ], "STD": [ 0.225, 0.225, 0.225 ], "MULTI_LABEL": false, "ENSEMBLE_METHOD": "sum", "TARGET_FPS": 30, "MINUS_INTERVAL": false, "FPS": 30 }, "MODEL": { "NAME": "R2Plus1D", "EMA": { "ENABLE": false, "DECAY": 0.99996 } }, "VIDEO": { "BACKBONE": { "DEPTH": 10, "META_ARCH": "ResNet3D", "NUM_FILTERS": [ 64, 64, 128, 256, 512 ], "NUM_INPUT_CHANNELS": 3, "NUM_OUT_FEATURES": 512, "KERNEL_SIZE": [ [ 3, 7, 7 ], [ 3, 3, 3 ], [ 3, 3, 3 ], [ 3, 3, 3 ], [ 3, 3, 3 ] ], "DOWNSAMPLING": [ true, false, true, true, true ], 
"DOWNSAMPLING_TEMPORAL": [ false, false, true, true, true ], "NUM_STREAMS": 1, "EXPANSION_RATIO": 2, "BRANCH": { "NAME": "R2Plus1DBranch" }, "STEM": { "NAME": "R2Plus1DStem" }, "NONLOCAL": { "ENABLE": false, "STAGES": [ 5 ], "MASK_ENABLE": false }, "INITIALIZATION": null }, "HEAD": { "NAME": "BaseHead", "ACTIVATION": "softmax", "DROPOUT_RATE": 0.5, "NUM_CLASSES": 51 } }, "OPTIMIZER": { "ADJUST_LR": false, "BASE_LR": 0.00075, "LR_POLICY": "cosine", "MAX_EPOCH": 300, "MOMENTUM": 0.9, "WEIGHT_DECAY": "1e-3", "WARMUP_EPOCHS": 10, "WARMUP_START_LR": 7.5e-05, "OPTIM_METHOD": "adam", "DAMPENING": 0.0, "NESTEROV": true, "BIAS_DOUBLE": false, "NEW_PARAMS": [], "NEW_PARAMS_MULT": 10, "NEW_PARAMS_WD_MULT": 1, "LAYER_WISE_LR_DECAY": 1.0, "COSINE_AFTER_WARMUP": false, "COSINE_END_LR": "1e-6" }, "BN": { "WB_LOCK": false, "FREEZE": false, "WEIGHT_DECAY": 0.0, "MOMENTUM": 0.1, "EPS": "1e-3", "SYNC": false }, "DATA_LOADER": { "NUM_WORKERS": 12, "PIN_MEMORY": false, "ENABLE_MULTI_THREAD_DECODE": true, "COLLATE_FN": null }, "NUM_GPUS": 4, "SHARD_ID": 0, "NUM_SHARDS": 1, "RANDOM_SEED": 0, "OUTPUT_DIR": "output/r2p1d_mosi_ft_hmdb_autor", "OUTPUT_CFG_FILE": "configuration.log", "LOG_PERIOD": 10, "DIST_BACKEND": "nccl", "LOG_MODEL_INFO": true, "LOG_CONFIG_INFO": true, "OSS": { "ENABLE": false, "KEY": null, "SECRET": null, "ENDPOINT": null, "CHECKPOINT_OUTPUT_PATH": null, "SECONDARY_DATA_OSS": { "ENABLE": false, "KEY": null, "SECRET": null, "ENDPOINT": null, "BUCKETS": [ "" ] } }, "AUGMENTATION": { "COLOR_AUG": true, "BRIGHTNESS": 0.5, "CONTRAST": 0.5, "SATURATION": 0.5, "HUE": 0.25, "GRAYSCALE": 0.3, "CONSISTENT": true, "SHUFFLE": true, "GRAY_FIRST": true, "RATIO": [ 0.857142857142857, 1.1666666666666667 ], "USE_GPU": false, "MIXUP": { "ENABLE": false, "ALPHA": 0.0, "PROB": 1.0, "MODE": "batch", "SWITCH_PROB": 0.5 }, "CUTMIX": { "ENABLE": false, "ALPHA": 0.0, "MINMAX": null }, "RANDOM_ERASING": { "ENABLE": false, "PROB": 0.25, "MODE": "const", "COUNT": [ 1, 1 ], "NUM_SPLITS": 0, 
"AREA_RANGE": [ 0.02, 0.33 ], "MIN_ASPECT": 0.3 }, "LABEL_SMOOTHING": 0.0, "SSV2_FLIP": false, "COLOR_P": 0.0, "AUTOAUGMENT": { "ENABLE": true, "BEFORE_CROP": true, "TYPE": "rand-m9-n4-mstd0.5-inc1" } }, "PAI": false, "USE_MULTISEG_VAL_DIST": false } I can't find where there is a problem with my training configuration. Can you provide your configuration file? Or could you give me a hint? Looking forward to your reply, thank you very much

huang-ziyuan commented 12 months ago

This is probably caused by the same problem as we previously discussed regarding the test-time cropping. We originally used a different function for resizing and cropping the videos (see here). We have since changed to KineticsResizedCrop. The logic for resizing and cropping differs between the two, which might be the underlying reason for the performance mismatch. You could try modifying the code to match the previous strategy, or change the configs for the current strategy to match the behaviour of our previous one.