GeorgeS2019 opened this issue 2 years ago:
converted bert.embeddings.word_embeddings.weight - 15627392 bytes
converted bert.embeddings.position_embeddings.weight - 262272 bytes
converted bert.embeddings.token_type_embeddings.weight - 1152 bytes
converted bert.embeddings.LayerNorm.weight - 624 bytes
converted bert.embeddings.LayerNorm.bias - 624 bytes
converted bert.encoder.layer.0.attention.self.query.weight - 65664 bytes
converted bert.encoder.layer.0.attention.self.query.bias - 624 bytes
converted bert.encoder.layer.0.attention.self.key.weight - 65664 bytes
converted bert.encoder.layer.0.attention.self.key.bias - 624 bytes
converted bert.encoder.layer.0.attention.self.value.weight - 65664 bytes
converted bert.encoder.layer.0.attention.self.value.bias - 624 bytes
converted bert.encoder.layer.0.attention.output.dense.weight - 65664 bytes
converted bert.encoder.layer.0.attention.output.dense.bias - 624 bytes
converted bert.encoder.layer.0.attention.output.LayerNorm.weight - 624 bytes
converted bert.encoder.layer.0.attention.output.LayerNorm.bias - 624 bytes
converted bert.encoder.layer.0.intermediate.dense.weight - 262272 bytes
converted bert.encoder.layer.0.intermediate.dense.bias - 2160 bytes
converted bert.encoder.layer.0.output.dense.weight - 262272 bytes
converted bert.encoder.layer.0.output.dense.bias - 624 bytes
converted bert.encoder.layer.0.output.LayerNorm.weight - 624 bytes
converted bert.encoder.layer.0.output.LayerNorm.bias - 624 bytes
converted bert.encoder.layer.1.attention.self.query.weight - 65664 bytes
converted bert.encoder.layer.1.attention.self.query.bias - 624 bytes
converted bert.encoder.layer.1.attention.self.key.weight - 65664 bytes
converted bert.encoder.layer.1.attention.self.key.bias - 624 bytes
converted bert.encoder.layer.1.attention.self.value.weight - 65664 bytes
converted bert.encoder.layer.1.attention.self.value.bias - 624 bytes
converted bert.encoder.layer.1.attention.output.dense.weight - 65664 bytes
converted bert.encoder.layer.1.attention.output.dense.bias - 624 bytes
converted bert.encoder.layer.1.attention.output.LayerNorm.weight - 624 bytes
converted bert.encoder.layer.1.attention.output.LayerNorm.bias - 624 bytes
converted bert.encoder.layer.1.intermediate.dense.weight - 262272 bytes
converted bert.encoder.layer.1.intermediate.dense.bias - 2160 bytes
converted bert.encoder.layer.1.output.dense.weight - 262272 bytes
converted bert.encoder.layer.1.output.dense.bias - 624 bytes
converted bert.encoder.layer.1.output.LayerNorm.weight - 624 bytes
converted bert.encoder.layer.1.output.LayerNorm.bias - 624 bytes
converted bert.pooler.dense.weight - 65664 bytes
converted bert.pooler.dense.bias - 624 bytes
converted cls.predictions.bias - 122200 bytes
converted cls.predictions.transform.dense.weight - 65664 bytes
converted cls.predictions.transform.dense.bias - 624 bytes
converted cls.predictions.transform.LayerNorm.weight - 624 bytes
converted cls.predictions.transform.LayerNorm.bias - 624 bytes
converted cls.predictions.decoder.weight - 15627392 bytes
converted cls.predictions.decoder.bias - 122200 bytes
converted cls.seq_relationship.weight - 1152 bytes
converted cls.seq_relationship.bias - 120 bytes
15,627,392 bert.embeddings.word_embeddings.weight.npy
262,272 bert.embeddings.position_embeddings.weight.npy
1,152 bert.embeddings.token_type_embeddings.weight.npy
640 bert.embeddings.LayerNorm.weight.npy
640 bert.embeddings.LayerNorm.bias.npy
65,664 bert.encoder.layer.0.attention.self.query.weight.npy
640 bert.encoder.layer.0.attention.self.query.bias.npy
65,664 bert.encoder.layer.0.attention.self.key.weight.npy
640 bert.encoder.layer.0.attention.self.key.bias.npy
65,664 bert.encoder.layer.0.attention.self.value.weight.npy
640 bert.encoder.layer.0.attention.self.value.bias.npy
65,664 bert.encoder.layer.0.attention.output.dense.weight.npy
640 bert.encoder.layer.0.attention.output.dense.bias.npy
640 bert.encoder.layer.0.attention.output.LayerNorm.weight.npy
640 bert.encoder.layer.0.attention.output.LayerNorm.bias.npy
262,272 bert.encoder.layer.0.intermediate.dense.weight.npy
2,176 bert.encoder.layer.0.intermediate.dense.bias.npy
262,272 bert.encoder.layer.0.output.dense.weight.npy
640 bert.encoder.layer.0.output.dense.bias.npy
640 bert.encoder.layer.0.output.LayerNorm.weight.npy
640 bert.encoder.layer.0.output.LayerNorm.bias.npy
65,664 bert.encoder.layer.1.attention.self.query.weight.npy
640 bert.encoder.layer.1.attention.self.query.bias.npy
65,664 bert.encoder.layer.1.attention.self.key.weight.npy
640 bert.encoder.layer.1.attention.self.key.bias.npy
65,664 bert.encoder.layer.1.attention.self.value.weight.npy
640 bert.encoder.layer.1.attention.self.value.bias.npy
65,664 bert.encoder.layer.1.attention.output.dense.weight.npy
640 bert.encoder.layer.1.attention.output.dense.bias.npy
640 bert.encoder.layer.1.attention.output.LayerNorm.weight.npy
640 bert.encoder.layer.1.attention.output.LayerNorm.bias.npy
262,272 bert.encoder.layer.1.intermediate.dense.weight.npy
2,176 bert.encoder.layer.1.intermediate.dense.bias.npy
262,272 bert.encoder.layer.1.output.dense.weight.npy
640 bert.encoder.layer.1.output.dense.bias.npy
640 bert.encoder.layer.1.output.LayerNorm.weight.npy
640 bert.encoder.layer.1.output.LayerNorm.bias.npy
65,664 bert.pooler.dense.weight.npy
640 bert.pooler.dense.bias.npy
122,216 cls.predictions.bias.npy
65,664 cls.predictions.transform.dense.weight.npy
640 cls.predictions.transform.dense.bias.npy
640 cls.predictions.transform.LayerNorm.weight.npy
640 cls.predictions.transform.LayerNorm.bias.npy
15,627,392 cls.predictions.decoder.weight.npy
122,216 cls.predictions.decoder.bias.npy
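For reference, these .npy dumps can be read back into TorchSharp tensors without going through NumPy. Below is a minimal sketch (a hypothetical helper of my own, not part of any PR) that assumes version-1.0 .npy headers with little-endian float32 data in C order, which matches the tensor sizes listed above:

open System.IO
open System.Text
open System.Text.RegularExpressions
open TorchSharp

// Minimal .npy reader sketch: handles only v1.0 headers with
// little-endian float32 data in C order (the format dumped above).
let loadNpy (path: string) : torch.Tensor =
    use br = new BinaryReader(File.OpenRead path)
    br.ReadBytes 8 |> ignore                              // magic "\x93NUMPY" + version 1.0
    let headerLen = int (br.ReadUInt16())                 // v1.0 header length, little-endian
    let header = Encoding.ASCII.GetString(br.ReadBytes headerLen)
    let shape =                                           // parse e.g. 'shape': (128, 128)
        Regex.Match(header, @"'shape':\s*\(([^)]*)\)").Groups.[1].Value.Split(',')
        |> Array.choose (fun s -> let t = s.Trim() in if t = "" then None else Some(int64 t))
    let count = shape |> Array.fold (*) 1L
    let bytes = br.ReadBytes(int count * 4)
    let data = Array.zeroCreate<float32> (int count)
    System.Buffer.BlockCopy(bytes, 0, data, 0, bytes.Length)
    torch.from_array(data).reshape(shape)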
@GeorgeS2019 thanks for the update - good to know
Note that IF your model uses the PyTorch/TorchSharp transformer layer, e.g.
let encoderLayer = torch.nn.TransformerEncoderLayer(HIDDEN, N_HEADS, MAX_POS_EMB, ATTN_DROPOUT_PROB, activation=ENCODER_ACTIVATION)
let encoder = torch.nn.TransformerEncoder(encoderLayer, ENCODER_LAYERS)
THEN several of the BERT layers have to be packaged together to load the encoder weights correctly, e.g.
type PostProc = V | H | T | N

let postProc (ts: torch.Tensor list) = function
    | V -> torch.vstack(ResizeArray ts)
    | H -> torch.hstack(ResizeArray ts)
    | T -> ts.Head.T // Linear layer weights need to be transposed. See https://github.com/pytorch/pytorch/issues/2159
    | N -> ts.Head
let nameMap =
    [
        "encoder.layers.#.self_attn.in_proj_weight", ["encoder/layer_#/attention/self/query/kernel";
                                                      "encoder/layer_#/attention/self/key/kernel";
                                                      "encoder/layer_#/attention/self/value/kernel"], V
        "encoder.layers.#.self_attn.in_proj_bias",   ["encoder/layer_#/attention/self/query/bias";
                                                      "encoder/layer_#/attention/self/key/bias";
                                                      "encoder/layer_#/attention/self/value/bias"], H
        ...
    ]
In the nameMap list of 3-tuples above, the 1st element is the TorchSharp layer name, and the 2nd element is the list of BERT layers whose weights should be concatenated together to match the PyTorch layer's weights. The 3rd element, the PostProc case, says how to combine them: V and H represent vertical and horizontal stacking, respectively.
If the weights are from TensorFlow, then 'linear' layer weights have to be transposed first (PostProc.T); see the full notebook for details. If the weights are from a PyTorch model, then this transposition may not be needed.
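To make the mapping concrete, here is a hypothetical sketch (not taken from the notebook) of resolving one nameMap entry for a given layer index; it assumes a loadNpy helper like the one sketched above that returns a torch.Tensor for an exported weight name:

// Hypothetical application of a nameMap entry: substitute the layer index
// for '#', load each BERT tensor, then combine them with postProc.
let resolveEntry (loadNpy: string -> torch.Tensor) (layer: int)
                 (tsName: string, bertNames: string list, proc: PostProc) =
    let sub (s: string) = s.Replace("#", string layer)
    let parts = bertNames |> List.map (sub >> loadNpy)
    sub tsName, postProc parts proc

// e.g. resolveEntry loadNpy 0 (nameMap |> List.head)
// yields ("encoder.layers.0.self_attn.in_proj_weight", <vstacked Q/K/V kernels>)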
I believe the HuggingFace versions don't use the PyTorch 'transformer' layer. They use base layers that, taken together, are equivalent to a transformer layer.
bert_uncased_L-2_H-128_A-2
pytorch_model.bin
@fwaris With the implementation of this PR it is now possible to extract the pre-trained weights from pytorch_model.bin and save them in TorchSharp as (tensor name, weight) pairs.
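As a rough sketch of what that last step could look like (the applyWeights helper below is an assumption for illustration, not the PR's actual API), the mapped (name, tensor) pairs can be copied into the model's parameters by name and the module then saved in TorchSharp's own format:

// Hypothetical final step: copy each mapped tensor into the matching
// module parameter by name, then persist the module in TorchSharp format.
let applyWeights (model: torch.nn.Module) (weights: (string * torch.Tensor) list) =
    use _noGrad = torch.no_grad()
    let ps =
        model.named_parameters()
        |> Seq.map (fun struct (name, p) -> name, p)
        |> dict
    for name, w in weights do
        ps.[name].copy_(w) |> ignore

// applyWeights model mappedWeights
// model.save "bert_mini.ts" |> ignore   // reloadable later with model.load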