How the function is imported:
from paddleslim.nas.ofa.utils import nlp_utils
The original source of the function:
def compute_neuron_head_importance(task_name,
                                   model,
                                   data_loader,
                                   num_layers,
                                   num_heads,
                                   loss_fct=paddle.nn.loss.CrossEntropyLoss(),
                                   intermediate_name='linear1',
                                   output_name='linear2'):
    """
    Compute the importance of multi-head attention and feed-forward neuron in each transformer layer.

    Args:
        task_name(str): task name.
        model(paddle.nn.Layer): the instance of transformer model.
        data_loader(DataLoader): An iterable data loader is used for evaluate. An instance of `paddle.io.Dataloader`.
        num_layers(int): number of transformer layers.
        num_heads(int): number of heads in each multi-head attention.
        loss_fct(Loss|optional): loss function can be a `paddle.nn.Layer` instance. Default: `nn.loss.CrossEntropyLoss()`.
        intermediate_name(str|optional): the name of intermediate `Linear` layer in feed-forward. Default: `linear1`.
        output_name(str|optional): the name of output `Linear` layer in feed-forward. Default: `linear2`.
    """
    head_importance = paddle.zeros(
        shape=[num_layers, num_heads], dtype='float32')
    head_mask = paddle.ones(shape=[num_layers, num_heads], dtype='float32')
    head_mask.stop_gradient = False

    intermediate_weight = []
    intermediate_bias = []
    output_weight = []

    for name, w in model.named_parameters():
        if intermediate_name in name:
            if len(w.shape) > 1:
                intermediate_weight.append(w)
            else:
                intermediate_bias.append(w)

        if output_name in name:
            if len(w.shape) > 1:
                output_weight.append(w)

    neuron_importance = []
    for w in intermediate_weight:
        neuron_importance.append(np.zeros(shape=[w.shape[1]], dtype='float32'))

    if task_name.lower() != 'mnli':
        data_loader = (data_loader, )
    for data in data_loader:
        for batch in data:
            if isinstance(batch, dict):
                input_ids, segment_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
            else:
                input_ids, segment_ids, labels = batch
            logits = model(
                input_ids, segment_ids, attention_mask=[None, head_mask])
            loss = loss_fct(logits, labels)
            loss.backward()
            head_importance += paddle.abs(
                paddle.to_tensor(head_mask.gradient()))

            for w1, b1, w2, current_importance in zip(
                    intermediate_weight, intermediate_bias, output_weight,
                    neuron_importance):
                current_importance += np.abs(
                    (np.sum(w1.numpy() * w1.gradient(), axis=0) + b1.numpy() *
                     b1.gradient()))
                current_importance += np.abs(
                    np.sum(w2.numpy() * w2.gradient(), axis=1))

    return head_importance, neuron_importance
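For reference, here is a minimal, self-contained sketch of what the function appears to expect from its arguments, based only on the code above. ToyModel, its layer names, and the single-batch loader are hypothetical stand-ins of my own (not from PaddleSlim or PaddleNLP): the model's forward must accept (input_ids, segment_ids) positionally plus attention_mask as a [pad_mask, head_mask] list, and its feed-forward parameters must contain 'linear1' / 'linear2' in their names so the importance loop can collect them.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddleslim.nas.ofa.utils import nlp_utils

class ToyModel(nn.Layer):
    """Hypothetical stand-in whose forward matches the call made inside
    compute_neuron_head_importance:
        model(input_ids, segment_ids, attention_mask=[None, head_mask])"""

    def __init__(self, num_layers=2, num_heads=4, hidden=8, num_classes=2):
        super().__init__()
        self.emb = nn.Embedding(100, hidden)
        # names contain 'linear1' / 'linear2' so the importance loop can find them
        self.linear1 = nn.LayerList(
            [nn.Linear(hidden, hidden * 4) for _ in range(num_layers)])
        self.linear2 = nn.LayerList(
            [nn.Linear(hidden * 4, hidden) for _ in range(num_layers)])
        self.cls = nn.Linear(hidden, num_classes)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        pad_mask, head_mask = attention_mask   # unpack the [None, head_mask] list
        x = self.emb(input_ids)
        for i, (fc1, fc2) in enumerate(zip(self.linear1, self.linear2)):
            # scale by this layer's head mask so head_mask receives a gradient
            x = x * paddle.mean(head_mask[i])
            x = fc2(F.relu(fc1(x)))
        return self.cls(paddle.mean(x, axis=1))

model = ToyModel()
# one batch in tuple form (input_ids, segment_ids, labels); a dict with
# 'input_ids'/'token_type_ids'/'labels' keys would also be accepted
batch = (paddle.randint(0, 100, [2, 5]),
         paddle.zeros([2, 5], dtype='int64'),
         paddle.to_tensor([0, 1]))
loader = [batch]   # any iterable of batches; non-'mnli' tasks get wrapped in a 1-tuple

head_imp, neuron_imp = nlp_utils.compute_neuron_head_importance(
    task_name='toy', model=model, data_loader=loader,
    num_layers=2, num_heads=4)
print(head_imp.shape)                 # [2, 4]
print([n.shape for n in neuron_imp])  # [(32,), (32,)]

The key point of the sketch is the first line of forward: whatever model is passed in has to be able to unpack attention_mask as a two-element list, because that is the form the function hands it.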
When using this function, I encountered the following error:

AttributeError                            Traceback (most recent call last)
Cell In[46], line 180
    172 dev_batch_sampler = paddle.io.BatchSampler(
    173     dev_ds, batch_size=4, shuffle=False)
    174 dev_data_loader = DataLoader(
    175     dataset=dev_ds,
    176     #batch_sampler=dev_batch_sampler,
    177     #collate_fn=batchify_fn
    178 )
--> 180 head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
    181     task_name='cluewsc2020',
    182     model=ofa_model.model,
    183     data_loader=dev_ds,
    184     loss_fct=paddle.nn.loss.CrossEntropyLoss(
    185     ) if [True,False] else paddle.nn.loss.MSELoss(),
    186     num_layers=model.ppminilm.config['num_hidden_layers'],
    187     num_heads=model.ppminilm.config['num_attention_heads'])
    189 # Re-order the parameters
    190 reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleslim/nas/ofa/utils/nlp_utils.py:76, in compute_neuron_head_importance(task_name, model, data_loader, num_layers, num_heads, loss_fct, intermediate_name, output_name)
     74     else:
     75         input_ids, segment_ids, labels = batch
---> 76     logits = model(
     77         input_ids, segment_ids, attention_mask=[None, head_mask])
     78     loss = loss_fct(logits, labels)
     79     loss.backward()

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1426, in Layer.__call__(self, *inputs, **kwargs)
   1417 if (
   1418     (not in_to_static_mode())
   1419     and (not self._forward_pre_hooks)
   (...)
   1423     and (not in_profiler_mode())
   1424 ):
   1425     self._build_once(*inputs, **kwargs)
-> 1426     return self.forward(*inputs, **kwargs)
   1427 else:
   1428     return self._dygraph_call_func(*inputs, **kwargs)

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddlenlp/transformers/ppminilm/modeling.py:300, in PPMiniLMForSequenceClassification.forward(self, input_ids, token_type_ids, position_ids, attention_mask)
    270 def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
    271     r"""
    272     Args:
    273         input_ids (Tensor):
    (...)
    298
    299     """
--> 300     _, pooled_output = self.ppminilm(
    301         input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask
    302     )
    304     pooled_output = self.dropout(pooled_output)
    305     logits = self.classifier(pooled_output)

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1426, in Layer.__call__(self, *inputs, **kwargs)
   1417 if (
   1418     (not in_to_static_mode())
   1419     and (not self._forward_pre_hooks)
   (...)
   1423     and (not in_profiler_mode())
   1424 ):
   1425     self._build_once(*inputs, **kwargs)
-> 1426     return self.forward(*inputs, **kwargs)
   1427 else:
   1428     return self._dygraph_call_func(*inputs, **kwargs)

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddlenlp/transformers/ppminilm/modeling.py:230, in PPMiniLMModel.forward(self, input_ids, token_type_ids, position_ids, attention_mask)
    226     attention_mask = paddle.unsqueeze(
    227         (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]
    228     )
    229 else:
--> 230     if attention_mask.ndim == 2:
    231         # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length]
    232         attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype())
    233         attention_mask = (1.0 - attention_mask) * -1e4
AttributeError: 'list' object has no attribute 'ndim'
After looking into it, I believe the problem lies in how the function builds attention_mask. In the line

    logits = model(
        input_ids, segment_ids, attention_mask=[None, head_mask])

attention_mask is passed as the Python list [None, head_mask], and that is what triggers the error above.
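To confirm the mechanism, here is a minimal reproduction of just the failing step. The head_mask below is a stand-in tensor that I create myself; the real one is built inside compute_neuron_head_importance with shape [num_layers, num_heads]. PPMiniLMModel.forward evaluates attention_mask.ndim (line 230 of the traceback), and a plain Python list has no ndim attribute:

import paddle

# Stand-in for the head mask that compute_neuron_head_importance creates internally.
head_mask = paddle.ones(shape=[12, 12], dtype='float32')
attention_mask = [None, head_mask]   # exactly what the function passes to the model

# PPMiniLMModel.forward then checks `attention_mask.ndim == 2`,
# which fails on a list just like in the traceback above.
try:
    attention_mask.ndim == 2
except AttributeError as e:
    print(e)   # 'list' object has no attribute 'ndim'

So, as written, the function seems to assume a model whose forward can unpack attention_mask as a [pad_mask, head_mask] pair (as in the ToyModel sketch above), whereas the stock PPMiniLMForSequenceClassification.forward expects attention_mask to be a tensor or None, hence the AttributeError when ofa_model.model is the unmodified PPMiniLM classifier.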