PaddlePaddle / PaddleSlim

PaddleSlim is an open-source library for deep model compression and architecture search.
https://paddleslim.readthedocs.io/zh_CN/latest/
Apache License 2.0

It seems there is a problem in your pruning-related function #1890

Open yedaotian9 opened 3 months ago

yedaotian9 commented 3 months ago

How the function is imported: `from paddleslim.nas.ofa.utils import nlp_utils`. The original source of the function:

```python
# Module-level imports in paddleslim/nas/ofa/utils/nlp_utils.py
import numpy as np
import paddle


def compute_neuron_head_importance(task_name,
                                   model,
                                   data_loader,
                                   num_layers,
                                   num_heads,
                                   loss_fct=paddle.nn.loss.CrossEntropyLoss(),
                                   intermediate_name='linear1',
                                   output_name='linear2'):
    """
    Compute the importance of multi-head attention and feed-forward neuron in each transformer layer.

    Args:
        task_name(str): task name.
        model(paddle.nn.Layer): the instance of transformer model.
        data_loader(DataLoader): An iterable data loader is used for evaluate. An instance of `paddle.io.Dataloader`.
        num_layers(int): number of transformer layers.
        num_heads(int): number of heads in each multi-head attention.
        loss_fct(Loss|optional): loss function can be a `paddle.nn.Layer` instance. Default: `nn.loss.CrossEntropyLoss()`.
        intermediate_name(str|optional): the name of intermediate `Linear` layer in feed-forward. Default: `linear1`.
        output_name(str|optional): the name of output `Linear` layer in feed-forward. Default: `linear2`.
    """
    head_importance = paddle.zeros(
        shape=[num_layers, num_heads], dtype='float32')
    head_mask = paddle.ones(shape=[num_layers, num_heads], dtype='float32')
    head_mask.stop_gradient = False

    intermediate_weight = []
    intermediate_bias = []
    output_weight = []

    for name, w in model.named_parameters():
        if intermediate_name in name:
            if len(w.shape) > 1:
                intermediate_weight.append(w)
            else:
                intermediate_bias.append(w)

        if output_name in name:
            if len(w.shape) > 1:
                output_weight.append(w)

    neuron_importance = []
    for w in intermediate_weight:
        neuron_importance.append(np.zeros(shape=[w.shape[1]], dtype='float32'))

    if task_name.lower() != 'mnli':
        data_loader = (data_loader, )
    for data in data_loader:
        for batch in data:
            if isinstance(batch, dict):
                input_ids, segment_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
            else:
                input_ids, segment_ids, labels = batch
            logits = model(
                input_ids, segment_ids, attention_mask=[None, head_mask])
            loss = loss_fct(logits, labels)
            loss.backward()
            head_importance += paddle.abs(
                paddle.to_tensor(head_mask.gradient()))

            for w1, b1, w2, current_importance in zip(
                    intermediate_weight, intermediate_bias, output_weight,
                    neuron_importance):
                current_importance += np.abs(
                    (np.sum(w1.numpy() * w1.gradient(), axis=0) + b1.numpy() *
                     b1.gradient()))
                current_importance += np.abs(
                    np.sum(w2.numpy() * w2.gradient(), axis=1))

    return head_importance, neuron_importance
```
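For reference, the batch loop above expects each element yielded by `data_loader` to be either a dict with `input_ids` / `token_type_ids` / `labels` keys or a plain `(input_ids, segment_ids, labels)` tuple. A minimal sketch of a `DataLoader` that satisfies that contract (the toy dataset and collate function below are illustrative assumptions, not code from this issue):

```python
import paddle
from paddle.io import Dataset, DataLoader

class ToyTokenizedDataset(Dataset):
    """Illustrative stand-in for a tokenized dev set."""
    def __init__(self, n=8):
        self.samples = [{'input_ids': [1, 5, 9, 2],
                         'token_type_ids': [0, 0, 0, 0],
                         'labels': 1} for _ in range(n)]

    def __getitem__(self, idx):
        return self.samples[idx]

    def __len__(self):
        return len(self.samples)

def collate_as_dict(batch):
    # Keys mirror what the importance loop unpacks from dict batches.
    return {
        'input_ids': paddle.to_tensor([s['input_ids'] for s in batch]),
        'token_type_ids': paddle.to_tensor([s['token_type_ids'] for s in batch]),
        'labels': paddle.to_tensor([s['labels'] for s in batch]),
    }

dev_data_loader = DataLoader(ToyTokenizedDataset(), batch_size=4,
                             shuffle=False, collate_fn=collate_as_dict)
```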

When calling this function, I hit the following error:

```
AttributeError                            Traceback (most recent call last)
Cell In[46], line 180
    172 dev_batch_sampler = paddle.io.BatchSampler(
    173     dev_ds, batch_size=4, shuffle=False)
    174 dev_data_loader = DataLoader(
    175     dataset=dev_ds,
    176     #batch_sampler=dev_batch_sampler,
    177     #collate_fn=batchify_fn
    178 )
--> 180 head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
    181     task_name='cluewsc2020',
    182     model=ofa_model.model,
    183     data_loader=dev_ds,
    184     loss_fct=paddle.nn.loss.CrossEntropyLoss(
    185     ) if [True, False] else paddle.nn.loss.MSELoss(),
    186     num_layers=model.ppminilm.config['num_hidden_layers'],
    187     num_heads=model.ppminilm.config['num_attention_heads'])
    189 # Reorder the parameters
    190 reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleslim/nas/ofa/utils/nlp_utils.py:76, in compute_neuron_head_importance(task_name, model, data_loader, num_layers, num_heads, loss_fct, intermediate_name, output_name)
     74 else:
     75     input_ids, segment_ids, labels = batch
---> 76 logits = model(
     77     input_ids, segment_ids, attention_mask=[None, head_mask])
     78 loss = loss_fct(logits, labels)
     79 loss.backward()

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1426, in Layer.__call__(self, *inputs, **kwargs)
   1417 if (
   1418     (not in_to_static_mode())
   1419     and (not self._forward_pre_hooks)
   (...)
   1423     and (not in_profiler_mode())
   1424 ):
   1425     self._build_once(*inputs, **kwargs)
-> 1426     return self.forward(*inputs, **kwargs)
   1427 else:
   1428     return self._dygraph_call_func(*inputs, **kwargs)

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddlenlp/transformers/ppminilm/modeling.py:300, in PPMiniLMForSequenceClassification.forward(self, input_ids, token_type_ids, position_ids, attention_mask)
    270 def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
    271     r"""
    272     Args:
    273         input_ids (Tensor):
   (...)
    298
    299     """
--> 300     _, pooled_output = self.ppminilm(
    301         input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask
    302     )
    304     pooled_output = self.dropout(pooled_output)
    305     logits = self.classifier(pooled_output)

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1426, in Layer.__call__(self, *inputs, **kwargs)
   1417 if (
   1418     (not in_to_static_mode())
   1419     and (not self._forward_pre_hooks)
   (...)
   1423     and (not in_profiler_mode())
   1424 ):
   1425     self._build_once(*inputs, **kwargs)
-> 1426     return self.forward(*inputs, **kwargs)
   1427 else:
   1428     return self._dygraph_call_func(*inputs, **kwargs)

File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddlenlp/transformers/ppminilm/modeling.py:230, in PPMiniLMModel.forward(self, input_ids, token_type_ids, position_ids, attention_mask)
    226     attention_mask = paddle.unsqueeze(
    227         (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]
    228     )
    229 else:
--> 230     if attention_mask.ndim == 2:
    231         # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length]
    232         attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype())
    233         attention_mask = (1.0 - attention_mask) * -1e4

AttributeError: 'list' object has no attribute 'ndim'
```

After looking into it, I believe the attention_mask handling in this function is wrong. In the line `logits = model(input_ids, segment_ids, attention_mask=[None, head_mask])`, attention_mask is passed as the list `[None, head_mask]`, and that is what triggers the error.
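The type mismatch can be reproduced outside the model (a standalone sketch of the diagnosis above, not a fix): the helper always passes a Python list as `attention_mask`, while `PPMiniLMModel.forward` accepts `None` or a tensor and immediately reads `.ndim` on it.

```python
import paddle

head_mask = paddle.ones(shape=[12, 12], dtype='float32')
attention_mask = [None, head_mask]      # what compute_neuron_head_importance passes

print(hasattr(attention_mask, 'ndim'))  # False -- a plain list has no .ndim
attention_mask.ndim                     # AttributeError: 'list' object has no attribute 'ndim'
```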

minghaoBD commented 1 month ago

Hi, sorry for the slow reply. This function belongs to the NAS module; for pruning directly, please refer to https://github.com/PaddlePaddle/PaddleSlim/blob/release/2.0.0/docs/zh_cn/tutorials/pruning/overview.md
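For illustration, a rough sketch of the dygraph filter-pruning entry point described in that overview (the model, input shape, and parameter name below are assumptions for the example; see the linked tutorial for the exact API):

```python
import paddle
from paddle.vision.models import mobilenet_v1
from paddleslim.dygraph import L1NormFilterPruner

net = mobilenet_v1(pretrained=False)

# The pruner traces the network once with the given input shape.
pruner = L1NormFilterPruner(net, [1, 3, 224, 224])

# Prune 25% of the output filters (axis 0 of the weight) of one conv layer;
# 'conv2d_22.w_0' is a placeholder parameter name.
pruner.prune_vars({'conv2d_22.w_0': 0.25}, axis=0)
```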

Also, if you need to prune an LLM, some adaptation work would be required; PaddleSlim does not support that yet.