Closed: zhangron013 closed this issue 3 months ago
The dependency versions look fine. Can you confirm that the EAT repo sits under the fairseq folder? The function should enter this interface to perform the forward pass.
Yes, EAT is under fairseq. I followed the instructions on the project homepage:
git clone https://github.com/pytorch/fairseq
cd fairseq
pip install --editable ./
git clone https://github.com/cwx-worst-one/EAT
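For reference, the resulting layout (assuming the commands above were run verbatim; the debugger paths later in this thread show the same structure) is:

fairseq/          (cloned from pytorch/fairseq; pip install --editable ./ run here)
├── fairseq/      (the library package itself)
└── EAT/          (EAT cloned inside the fairseq checkout)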
However, it does not seem to enter that interface as expected. I set a breakpoint at fairseq/EAT/feature_extract/feature_extract.py (line 95):
elif granularity == 'frame':
    import pdb
    pdb.set_trace()
    feats = model.extract_features(source, padding_mask=None,mask=False, remove_extra_tokens=True)
    feats = feats['x'].squeeze(0).cpu().numpy()
Single-stepping shows that EAT_pretrain's extract_features method is never called. I also printed the arguments passed into fairseq; could you check whether they are correct?
-> feats = model.extract_features(source, padding_mask=None,mask=False, remove_extra_tokens=True)
(Pdb) s
--Call--
> /dc-hl/luotao.zhang/project/wavcaps_test/captioning/fairseq/fairseq/models/fairseq_model.py(95)extract_features()
-> def extract_features(self, *args, **kwargs):
(Pdb) s
> /dc-hl/luotao.zhang/project/wavcaps_test/captioning/fairseq/fairseq/models/fairseq_model.py(97)extract_features()
-> return self(*args, **kwargs)
(Pdb) args
self = MaeImageClassificationModel(
  (model): Data2VecMultiModel(
    (modality_encoders): ModuleDict(
      (IMAGE): ImageEncoder(
        (local_encoder): PatchEmbed_new(
          (proj): Conv2d(1, 1024, kernel_size=(16, 16), stride=(16, 16))
        )
        (project_features): Identity()
        (fixed_positional_encoder): FixedPositionalEncoder()
        (context_encoder): BlockEncoder(
          (blocks): ModuleList()
          (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (dropout): Dropout(p=0.0, inplace=True)
        )
        (decoder): None
      )
    )
    (dropout_input): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): AltBlock(
        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (attn): AltAttention(
          (qkv): Linear(in_features=1024, out_features=3072, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (act): GELU(approximate='none')
          (drop1): Dropout(p=0.0, inplace=False)
          (norm): Identity()
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (drop2): Dropout(p=0.0, inplace=False)
        )
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
      )
      [... AltBlocks (1) through (23) omitted: identical to (0) except (drop_path), which is DropPath(drop_prob=...) increasing linearly from 0.004 to 0.100 ...]
    )
  )
  (fc_norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
  (head): Linear(in_features=1024, out_features=527, bias=True)
)
args = (tensor([[[[-0.7271, -1.2596, -0.8827, ..., 0.7383, 0.6534, 0.6670],
[-0.6955, -1.0710, -0.6942, ..., 0.7360, 0.5563, 0.6051],
[-0.6170, -0.8173, -0.4405, ..., 0.6050, 0.5255, 0.5959],
...,
[ 0.4671, 0.4671, 0.4671, ..., 0.4671, 0.4671, 0.4671],
[ 0.4671, 0.4671, 0.4671, ..., 0.4671, 0.4671, 0.4671],
[ 0.4671, 0.4671, 0.4671, ..., 0.4671, 0.4671, 0.4671]]]],
device='cuda:0'),)
kwargs = {'padding_mask': None, 'mask': False, 'remove_extra_tokens': True}
(Pdb) s
--Call--
> /home/luotao.zhang/anaconda3/envs/audioeat/lib/python3.8/site-packages/torch/nn/modules/module.py(1514)_wrapped_call_impl()
-> def _wrapped_call_impl(self, *args, **kwargs):
(Pdb) s
> /home/luotao.zhang/anaconda3/envs/audioeat/lib/python3.8/site-packages/torch/nn/modules/module.py(1515)_wrapped_call_impl()
-> if self._compiled_call_impl is not None:
(Pdb) s
> /home/luotao.zhang/anaconda3/envs/audioeat/lib/python3.8/site-packages/torch/nn/modules/module.py(1518)_wrapped_call_impl()
-> return self._call_impl(*args, **kwargs)
(Pdb) s
--Call--
> /home/luotao.zhang/anaconda3/envs/audioeat/lib/python3.8/site-packages/torch/nn/modules/module.py(1520)_call_impl()
-> def _call_impl(self, *args, **kwargs):
(Pdb) s
> /home/luotao.zhang/anaconda3/envs/audioeat/lib/python3.8/site-packages/torch/nn/modules/module.py(1521)_call_impl()
-> forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.forward)
(Pdb) s
[... repeated single-steps through the hook checks at module.py lines 1524-1526 omitted ...]
> /home/luotao.zhang/anaconda3/envs/audioeat/lib/python3.8/site-packages/torch/nn/modules/module.py(1527)_call_impl()
-> return forward_call(*args, **kwargs)
(Pdb) s
TypeError: forward() got an unexpected keyword argument 'padding_mask'
> /home/luotao.zhang/anaconda3/envs/audioeat/lib/python3.8/site-packages/torch/nn/modules/module.py(1527)_call_impl()
-> return forward_call(*args, **kwargs)
(Pdb)
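In other words, the trace shows that extract_features here is fairseq's generic wrapper, which has no feature-extraction logic of its own (quoted from fairseq/fairseq/models/fairseq_model.py exactly as it appears in the trace):

def extract_features(self, *args, **kwargs):
    # Everything is forwarded verbatim to self.__call__, i.e. to the
    # forward() of whatever model class the checkpoint was loaded as.
    return self(*args, **kwargs)

Since the checkpoint was loaded as MaeImageClassificationModel, whose forward() accepts no padding_mask keyword, forwarding the kwargs raises the TypeError above.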
I tested this: in your case you are using a finetuned checkpoint, but when running the EAT/scripts/feature_extract.sh script you did not set the mode parameter (which selects the checkpoint type) to finetune (the script's default is mode='pretrain').
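Concretely: in EAT/scripts/feature_extract.sh, change the default mode='pretrain' to mode='finetune' and rerun (check the script itself for the exact variable spelling in your version). The 527-way classification head in the model printout above is the telltale sign: that is a finetuned, AudioSet-style classifier rather than the pretrained encoder.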
Yes, exactly. Thank you very much! I had just figured it out as well. Sorry that such a small issue took up so much of your time.
No worries at all!
Hello! When extracting audio features with bash EAT/scripts/feature_extract.sh I get the error "TypeError: forward() got an unexpected keyword argument 'padding_mask'". The detailed error log follows. Could some of my installed dependency versions be wrong? How should I fix this?
Environment