Official PyTorch implementation of Adventurer, a family of linear-time image models with a causal modeling paradigm.
Arxiv: https://arxiv.org/pdf/2410.07599
Model | Input Size | IN-1k Top-1 Acc. | Checkpoint |
---|---|---|---|
Adventurer-Tiny | 224 | 78.2 | Adventurer_tiny_patch16_224 |
Adventurer-Small | 224 | 81.8 | Adventurer_small_patch16_224 |
Adventurer-Base | 224 | 82.6 | Adventurer_base_patch16_224 |
Adventurer-Large | 224 | 83.4 | Adventurer_large_patch16_224 |
Adventurer-Base/P8 | 224 | 83.9 | Adventurer_base_patch8_224 |
Adventurer-Base | 384 | 84.2 | Adventurer_base_patch16_384 |
Adventurer-Base | 448 | 84.3 | Adventurer_base_patch16_448 |
Adventurer-Base/P8 | 448 | 84.8 | Adventurer_base_patch8_448 |
# Create and activate a dedicated conda environment with Python 3.10.
conda create -n adventurer python=3.10
source activate adventurer
# PyTorch 2.1.1 stack built against CUDA 12.1 (wheels from the official index).
pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121
# Training utilities: timm (models/augmentations), mlflow + wandb (experiment logging), submitit (cluster job submission).
pip install timm==0.4.12 mlflow==2.9.1 setuptools==69.5.1 wandb submitit
# CUDA kernels required by the Mamba-style SSM layers:
# causal depthwise conv1d and the selective-scan implementation.
pip install causal-conv1d==1.2.1
pip install mamba-ssm==2.0.4
# Evaluate Adventurer-Base/16 at 224x224 on ImageNet-1k (single GPU).
# Fix: '--input-szie' was a typo for '--input-size' and would make
# argparse abort with "unrecognized arguments".
python -m torch.distributed.launch --nproc_per_node=1 --use_env main.py \
  --model adventurer_base_patch16 --input-size 224 \
  --data-path /PATH/TO/IMAGENET --batch 128 \
  --resume /PATH/TO/CHECKPOINT \
  --eval --eval-crop-ratio 0.875
# Evaluate Adventurer-Base/8 at 448x448 on ImageNet-1k (single GPU);
# high-resolution models use a full-image crop ratio of 1.0.
# Fix: '--input-szie' was a typo for '--input-size' and would make
# argparse abort with "unrecognized arguments".
python -m torch.distributed.launch --nproc_per_node=1 --use_env main.py \
  --model adventurer_base_patch8 --input-size 448 \
  --data-path /PATH/TO/IMAGENET --batch 128 \
  --resume /PATH/TO/CHECKPOINT \
  --eval --eval-crop-ratio 1.0
# Stage 1 pre-training: Adventurer-Base/16 at low resolution (128x128),
# 300 epochs on 8 GPUs, 3-Augment + repeated augmentation, no random
# erasing or label smoothing.
python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py \
  --model adventurer_base_patch16 \
  --data-path /PATH/TO/IMAGENET \
  --input-size 128 --epochs 300 --dist-eval \
  --batch 128 --lr 5e-4 --weight-decay 0.05 --drop-path 0.1 \
  --repeated-aug --ThreeAugment --reprob 0.0 --smoothing 0.0 \
  --output_dir ./output/adventurer_base_patch16_224/s1_128
# Stage 2: fine-tune at 224x224 for 100 epochs, starting from the
# stage-1 (128x128) checkpoint.
# Fix: the '--finetune' line was missing its trailing '\' continuation,
# which silently split the command into two — the training run lost the
# checkpoint, and the shell then tried to execute '--output_dir ...'.
python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py \
  --model adventurer_base_patch16 \
  --data-path /PATH/TO/IMAGENET \
  --batch 128 --lr 5e-4 --weight-decay 0.05 \
  --finetune ./output/adventurer_base_patch16_224/s1_128/checkpoint.pth \
  --output_dir ./output/adventurer_base_patch16_224/s2_224 \
  --reprob 0.0 --smoothing 0.0 --repeated-aug --ThreeAugment \
  --epochs 100 --input-size 224 --drop-path 0.4 --dist-eval
# Stage 3: short fine-tune (20 epochs) at 224x224 with a small unscaled
# learning rate, RandAugment instead of 3-Augment, and light smoothing,
# starting from the stage-2 checkpoint.
# Fix: the '--finetune' line was missing its trailing '\' continuation,
# which silently split the command into two — the training run lost the
# checkpoint, and the shell then tried to execute '--output_dir ...'.
python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py \
  --model adventurer_base_patch16 \
  --data-path /PATH/TO/IMAGENET \
  --batch 64 --lr 1e-5 --weight-decay 0.1 --unscale-lr \
  --finetune ./output/adventurer_base_patch16_224/s2_224/checkpoint.pth \
  --output_dir ./output/adventurer_base_patch16_224/s3_224 \
  --reprob 0.0 --smoothing 0.1 --no-repeated-aug --aa rand-m9-mstd0.5-inc1 \
  --epochs 20 --input-size 224 --drop-path 0.6 --dist-eval
@article{wang2024causal,
title={Causal Image Modeling for Efficient Visual Understanding},
author={Wang, Feng and Yang, Timing and Yu, Yaodong and Ren, Sucheng and Wei, Guoyizhe and Wang, Angtian and Shao, Wei and Zhou, Yuyin and Yuille, Alan and Xie, Cihang},
journal={arXiv preprint arXiv:2410.07599},
year={2024}
}