NVIDIA / FasterTransformer

Transformer related optimization, including BERT, GPT
Apache License 2.0

Running deberta gives me different result for fastertransformer vs huggingface #707

Open sfc-gh-zhwang opened 1 year ago

sfc-gh-zhwang commented 1 year ago

Since there is no PyTorch code example provided, I wrote my own version below, but it gives a different result than the Hugging Face model. Is anything wrong with my code?

import torch


def listed_weights(weights: dict):
    # Flatten the Hugging Face DeBERTa checkpoint into the ordered weight list
    # expected by the FasterTransformer Deberta op: per-layer tensors are
    # stacked along dim 0, linear weights are transposed, and tensor-parallel
    # shards are taken with split()/contiguous().
    ret = []
    start_layer = 0
    end_layer = 24
    tensor_para_size = 1
    tensor_para_rank = 0
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' + 'attention.self.query_proj.weight'].transpose(-1, -2)
                       for layer_idx in range(start_layer, end_layer)], 0).contiguous())       # 0
    print(ret[-1].shape)
    ret[-1] = ret[-1].split(ret[-1].shape[-1] // tensor_para_size,
                            dim=-1)[tensor_para_rank].contiguous()
    print(ret[-1].shape)
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
                       'attention.self.query_proj.bias'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret[-1] = ret[-1].split(ret[-1].shape[-1] // tensor_para_size,
                            dim=-1)[tensor_para_rank].contiguous()
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' + 'attention.self.key_proj.weight'].transpose(-1, -2)
                           for layer_idx in range(start_layer, end_layer)], 0).contiguous())         # 2
    ret[-1] = ret[-1].split(ret[-1].shape[-1] // tensor_para_size,
                            dim=-1)[tensor_para_rank].contiguous()
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
               'attention.self.key_proj.bias'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret[-1] = ret[-1].split(ret[-1].shape[-1] // tensor_para_size,
                            dim=-1)[tensor_para_rank].contiguous()
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' + 'attention.self.value_proj.weight'].transpose(-1, -2)
               for layer_idx in range(start_layer, end_layer)], 0).contiguous())       # 4
    ret[-1] = ret[-1].split(ret[-1].shape[-1] // tensor_para_size,
                            dim=-1)[tensor_para_rank].contiguous()
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
               'attention.self.value_proj.bias'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret[-1] = ret[-1].split(ret[-1].shape[-1] // tensor_para_size,
                            dim=-1)[tensor_para_rank].contiguous()
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' + 'attention.output.dense.weight'].transpose(-1, -2)
               for layer_idx in range(start_layer, end_layer)], 0).contiguous())     # 6
    ret[-1] = ret[-1].split(ret[-1].shape[1] // tensor_para_size,
                            dim=1)[tensor_para_rank].contiguous()
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
               'attention.output.dense.bias'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
               'attention.output.LayerNorm.weight'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
               'attention.output.LayerNorm.bias'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' + 'intermediate.dense.weight'].transpose(-1, -2)
               for layer_idx in range(start_layer, end_layer)], 0).contiguous())         # 10
    ret[-1] = ret[-1].split(ret[-1].shape[-1] // tensor_para_size,
                            dim=-1)[tensor_para_rank].contiguous()
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
               'intermediate.dense.bias'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret[-1] = ret[-1].split(ret[-1].shape[-1] // tensor_para_size,
                            dim=-1)[tensor_para_rank].contiguous()
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' + 'output.dense.weight'].transpose(-1, -2)
               for layer_idx in range(start_layer, end_layer)], 0).contiguous())               # 12
    ret[-1] = ret[-1].split(ret[-1].shape[1] // tensor_para_size,
                            dim=1)[tensor_para_rank].contiguous()
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
               'output.dense.bias'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
               'output.LayerNorm.weight'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret.append(torch.stack([weights['deberta.encoder.layer.' + str(layer_idx) + '.' +
               'output.LayerNorm.bias'] for layer_idx in range(start_layer, end_layer)], 0).contiguous())
    ret.append(weights['deberta.embeddings.word_embeddings.weight'])
    ret.append(weights['deberta.embeddings.LayerNorm.weight'])
    ret.append(weights['deberta.embeddings.LayerNorm.bias'])
    ret.append(weights['deberta.encoder.rel_embeddings.weight'])
    ret.append(weights['deberta.encoder.LayerNorm.weight'])
    ret.append(weights['deberta.encoder.LayerNorm.bias'])
    return ret

model_dir = '/data/models/debertaV2/'
weights = torch.load(f'{model_dir}pytorch_model.bin')

layer_num = 24
head_num = 16
head_size = 64
max_relative_positions = 256
relative_position_buckets = 256
remove_padding = True
sparse = False
tensor_para_size = 1
pipeline_para_size = 1
inter_size = 4 * head_num * head_size

# Path to the FasterTransformer TorchScript op library (adjust to your build).
ths_path = 'lib/libth_transformer.so'
w = listed_weights(weights)

class CustomEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        torch.classes.load_library(ths_path)
        self.encoder = torch.classes.FasterTransformer.Deberta(
            *w,
            head_num,                    # int64_t head_num
            head_size,                   # int64_t head_size
            max_relative_positions,      # int64_t max_relative_positions
            relative_position_buckets,   # int64_t relative_position_buckets
            inter_size,
            remove_padding,
            layer_num,
            sparse,
            1.0,                         # q_scaling
            tensor_para_size,
            pipeline_para_size
        )

    def forward(self, input_ids, sequence_lengths):
        hidden_states = self.encoder.forward(input_ids.to('cuda'), sequence_lengths.to('cuda'))
        return hidden_states


custom_encoder = CustomEncoder()
custom_encoder = torch.jit.script(custom_encoder)
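
For reference, this is roughly how I compare the two outputs (not part of the conversion script above; the Hugging Face `DebertaV2Model`/`DebertaV2Tokenizer` usage and the test sentences are just illustrative, and assume the same checkpoint directory also contains the config and tokenizer files):

# Hypothetical comparison harness: run the same batch through the Hugging Face
# DeBERTa-V2 encoder and the FasterTransformer op, then look at the maximum
# absolute difference of the final hidden states.
from transformers import DebertaV2Model, DebertaV2Tokenizer

tokenizer = DebertaV2Tokenizer.from_pretrained(model_dir)
hf_model = DebertaV2Model.from_pretrained(model_dir).eval().cuda()

batch = tokenizer(['hello world', 'a longer test sentence'],
                  return_tensors='pt', padding=True)
input_ids = batch['input_ids']
sequence_lengths = batch['attention_mask'].sum(dim=1).to(torch.int32)

with torch.no_grad():
    hf_out = hf_model(input_ids.cuda(),
                      attention_mask=batch['attention_mask'].cuda()).last_hidden_state
    ft_out = custom_encoder(input_ids, sequence_lengths)

print('max abs diff:', (hf_out - ft_out).abs().max().item())
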
jonhilgart22 commented 1 year ago

Following

flexwang commented 1 year ago

Self-answer: it turns out q_scaling should be sqrt(3.0) instead of 1.0, but I don't fully understand the underlying reason. Is it because DeBERTa has c2c, c2p, and p2c attention terms while BERT only has c2c?
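
For anyone else hitting this, my understanding (from reading the Hugging Face DeBERTa-V2 attention code, not confirmed by the FT authors) is exactly that: the disentangled attention score is the sum of three terms (c2c, c2p, p2c), so Hugging Face divides the scores by sqrt(3 * d_head) instead of sqrt(d_head), while FasterTransformer divides by sqrt(d_head) * q_scaling. Setting q_scaling = sqrt(3.0) makes the two scales match, as in this small sketch:

import math

head_size = 64
scale_factor = 3                                # c2c + c2p + p2c terms in disentangled attention
hf_scale = math.sqrt(head_size * scale_factor)  # divisor used by the HF implementation
q_scaling = math.sqrt(3.0)                      # FT divides by sqrt(head_size) * q_scaling
ft_scale = math.sqrt(head_size) * q_scaling
assert abs(hf_scale - ft_scale) < 1e-9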