Here is my env. The version of transformers meets the requirements in monkeypatch.py:
torch==2.2.0
transformers==4.37.0
The traceback is as follows:
>> python pred_snap.py --model llama2-7b-chat-4k --compress_args_path ablation_c1024_w32_k7_maxpool.json
Traceback (most recent call last):
File "experiments/LongBench/pred_snap.py", line 321, in
File "/data1/ss/anaconda3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "experiments/LongBench/pred_snap.py", line 132, in get_pred_single_gpu
File "/data1/ss/anaconda3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/data1/ss/anaconda3/lib/python3.10/site-packages/transformers/generation/utils.py", line 1474, in generate
return self.greedy_search(
File "/data1/ss/anaconda3/lib/python3.10/site-packages/transformers/generation/utils.py", line 2335, in greedy_search
outputs = self(
File "/data1/ss/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/data1/ss/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/data1/ss/anaconda3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py", line 1183, in forward
outputs = self.model(
File "/data1/ss/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/data1/ss/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/data1/ss/anaconda3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py", line 1035, in forward
attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
File "/data1/ss/anaconda3/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py", line 398, in _prepare_4d_causal_attention_mask_for_sdpa
expanded_4d_mask = attn_mask_converter.to_4d(
File "/data1/ss/anaconda3/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py", line 137, in to_4d
expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min)
RuntimeError: The size of tensor a (3509) must match the size of tensor b (7017) at non-singleton dimension 3
I think the cause is a conflict between DynamicCache.get_usable_length and the causal-mask preparation function _prepare_4d_causal_attention_mask_for_sdpa.
I would like to know how I can quickly fix this. Thanks :)
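In case it helps narrow things down, one workaround I'm considering is forcing the eager attention implementation when loading the model, so that _prepare_4d_causal_attention_mask_for_sdpa is never called. This is only a sketch under the assumption that the mismatch comes from the SDPA mask path; the checkpoint name is my guess for what the llama2-7b-chat-4k alias maps to, and I'm not sure yet whether this plays well with the compressed KV cache from the monkeypatch:

```python
# Minimal sketch of a possible quick workaround, not a confirmed fix:
# load with attn_implementation="eager" so the SDPA-specific mask helper
# (_prepare_4d_causal_attention_mask_for_sdpa) is bypassed.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",  # assumed HF checkpoint behind llama2-7b-chat-4k
    torch_dtype=torch.float16,
    attn_implementation="eager",      # avoid the SDPA mask-preparation code path
    device_map="auto",
)
```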