Same problem here; it can be reproduced stably with the following script:
```python
from argparse import ArgumentParser
import random

import torch
import transformers

import turbo_transformers


def main(args):
    device = torch.device(args.device)
    config = transformers.BertConfig()
    torch_model = transformers.BertModel(config)
    model = turbo_transformers.BertModel.from_torch(
        torch_model, device=device,
    )
    possible_lengths = list(range(args.input_min_len, args.input_max_len))
    possible_ids = list(range(1, config.vocab_size))
    # Run forward passes forever on randomly sized, zero-padded batches;
    # process memory grows steadily while this loop runs.
    while True:
        batch_input_ids = []
        batch_attention_masks = []
        for _ in range(args.batch_size):
            input_ids = []
            attention_masks = []
            length = random.choice(possible_lengths)
            for _ in range(length):
                input_ids.append(random.choice(possible_ids))
                attention_masks.append(1)
            # Zero-pad every sequence to input_max_len.
            while len(input_ids) < args.input_max_len:
                input_ids.append(0)
                attention_masks.append(0)
            batch_input_ids.append(input_ids)
            batch_attention_masks.append(attention_masks)
        batch_input_ids = torch.tensor(batch_input_ids, device=device)
        batch_attention_masks = torch.tensor(batch_attention_masks, device=device)
        outputs = model(batch_input_ids, attention_masks=batch_attention_masks)


if __name__ == "__main__":
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--device", type=str, choices=["cpu", "cuda"], default="cuda")
    arg_parser.add_argument("--input_min_len", type=int, default=5)
    arg_parser.add_argument("--input_max_len", type=int, default=150)
    arg_parser.add_argument("--batch_size", type=int, default=16)
    args = arg_parser.parse_args()
    main(args)
```
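A note on measurement (my addition, not part of the original repro): steady memory growth is easiest to confirm by sampling the process RSS while the loop runs, since `torch.cuda.memory_allocated` only tracks PyTorch's own allocations and will not reflect memory held internally by turbo_transformers. A minimal sketch, assuming `psutil` is installed; the helper name `rss_mb` is mine:

```python
import os

import psutil  # assumed available: pip install psutil


def rss_mb() -> float:
    """Resident set size of the current process in MiB."""
    return psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)


# Example use inside the `while True` loop above, e.g. every 1000 steps:
#     print(f"step {step}: RSS {rss_mb():.1f} MiB")
# A monotonically increasing RSS over many steps indicates a real leak
# rather than one-time allocator warm-up.
```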
Thanks, I will check it.
The leak comes from BertAttention; other modules such as BertIntermediate do not leak. The following script reproduces the leak with BertAttention alone (a comparison sketch for the non-leaking case follows the script):
```python
from argparse import ArgumentParser

import torch
import transformers

import turbo_transformers
from turbo_transformers.layers.return_type import ReturnType
from turbo_transformers.layers.utils import convert2tt_tensor


def main(args):
    device = torch.device(args.device)
    config = transformers.BertConfig()
    torch_model = transformers.BertModel(config)
    torch_model.to(device)  # was .cuda(); .to(device) honors the --device flag
    bert_attention = turbo_transformers.BertAttention.from_torch(
        torch_model.encoder.layer[0].attention,
    )
    # Fixed-shape inputs, converted once; only the attention call repeats.
    batch_hidden_states = torch.rand(args.batch_size, args.seq_len, config.hidden_size).to(device)
    batch_attention_masks = torch.ones(args.batch_size, args.seq_len).to(device)
    batch_hidden_states = convert2tt_tensor(batch_hidden_states)
    batch_attention_masks = convert2tt_tensor(batch_attention_masks)
    while True:
        outputs = bert_attention(batch_hidden_states, attention_mask=batch_attention_masks,
                                 return_type=ReturnType.turbo_transformers)


if __name__ == "__main__":
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--device", type=str, choices=["cpu", "cuda"], default="cuda")
    arg_parser.add_argument("--seq_len", type=int, default=100)
    arg_parser.add_argument("--batch_size", type=int, default=16)
    args = arg_parser.parse_args()
    main(args)
```
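For contrast, the same harness can be pointed at the module reported above as leak-free. This is a sketch under two assumptions I have not verified against the turbo_transformers source: that `BertIntermediate.from_torch` takes the torch submodule directly, like `BertAttention.from_torch` above, and that its call accepts `(input_tensor, return_type=...)`:

```python
# Comparison loop over BertIntermediate, which the comment above reports as
# NOT leaking; memory should stay flat here, unlike the BertAttention loop.
import torch
import transformers

import turbo_transformers
from turbo_transformers.layers.return_type import ReturnType
from turbo_transformers.layers.utils import convert2tt_tensor

device = torch.device("cuda")
config = transformers.BertConfig()
torch_model = transformers.BertModel(config).to(device)

# Assumption: from_torch takes the torch submodule, mirroring BertAttention.
bert_intermediate = turbo_transformers.BertIntermediate.from_torch(
    torch_model.encoder.layer[0].intermediate,
)

hidden_states = convert2tt_tensor(
    torch.rand(16, 100, config.hidden_size, device=device)
)
while True:
    # Assumption: call signature is (input_tensor, return_type=...).
    outputs = bert_intermediate(hidden_states, return_type=ReturnType.turbo_transformers)
```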
After the author updated the docker image thufeifeibear/turbo_transformers_gpu:latest, this problem is fixed for me.
Thanks to the author for providing this tool; it does deliver a substantial speedup.
After deploying the model (installed via the docker image), I found that both the CPU and GPU versions exhibit a stable memory leak (compared against native PyTorch to rule out other causes; memory grows by roughly 100 MB per 10,000 inferences), with input tensor shape (8, 512, 768).
Also, in https://github.com/Tencent/TurboTransformers/issues/191#issuecomment-743036566 the author suggested trying the dev version, but due to network problems I have never managed to fetch the submodules during installation. Could a docker image be provided for it?