Open liangcaihua opened 2 years ago
第一种:
# Approach 1: hand-rolled top-k sampling loop.
# Relies on `model`, `full_tokenizer` and `device` being defined earlier in the session.
import numpy as np
import torch
import torch.nn.functional as F  # F.softmax is used below; the original snippet never imported F

n_ctx = model.config.n_ctx
temperature = 2  # NOTE(review): not applied anywhere in this snippet (scaling was commented out)
repitition_penalty = 1.7

# Tokens that must never be sampled. The lookup does not change between
# steps, so it is done once instead of on every iteration.
banned_ids = full_tokenizer.convert_tokens_to_ids(['[PAD]', '[UNK]'])

model.eval()
with torch.no_grad():
    # Every character of the phrase seeds its own, independent generation run.
    for generated in '家无铜臭孔方兄':
        for _step in range(120):
            eval_inputs = full_tokenizer(
                generated,
                add_special_tokens=False,
                truncation=True,
                return_tensors='pt',
            )
            eval_inputs.to(device)
            outputs = model(**eval_inputs)
            # Logits for the token following the last position of the first sequence.
            next_token_logits = outputs.logits[0, -1, :]

            # Poems should not repeat the same words: penalise the tokens of the
            # last 17 characters (punctuation removed). The original author notes
            # the effect is small; it only nudges the output away from reusing
            # the most recent input tokens.
            recent_text = generated[-17:].replace(',', '').replace('。', '')
            # NOTE(review): encode() runs with its defaults here, so any special
            # tokens the tokenizer adds are penalised too - confirm that is wanted.
            recent_ids = torch.tensor(
                sorted(set(full_tokenizer.encode(recent_text))),
                dtype=torch.long,
                device=next_token_logits.device,
            )
            recent_logits = next_token_logits[recent_ids]
            # Sign-aware penalty: dividing a negative logit by a penalty > 1 would
            # move it towards zero (more likely), so negative logits are
            # multiplied instead of divided.
            next_token_logits[recent_ids] = torch.where(
                recent_logits < 0,
                recent_logits * repitition_penalty,
                recent_logits / repitition_penalty,
            )

            # Rule out padding / unknown tokens entirely.
            next_token_logits[banned_ids] = -float('Inf')

            # Sample among the 10 highest-scoring candidates.
            values, indices = torch.topk(next_token_logits, 10)
            p = F.softmax(values, dim=-1)
            next_id = np.random.choice(indices.cpu().numpy(), 1, p=p.cpu().numpy())
            generated += full_tokenizer.decode(next_id)

        text = generated.replace(' ', '')
        print(text.replace('[CLS]', '\n'), '\n')
# 第二种 (approach 2): let the model's own `generate` method do the sampling.
# Relies on `model`, `full_tokenizer` and `device` being defined earlier in the session.

# The prompt is just the '[CLS]' marker, tokenized without extra special tokens.
eval_inputs = full_tokenizer('[CLS]', return_tensors='pt', add_special_tokens=False)
eval_inputs.to(device)
print(eval_inputs)

# All sampling settings gathered in one place.
sampling_options = dict(
    max_length=100,
    temperature=1.0,
    top_k=8,
    top_p=0.3,
    repetition_penalty=1.0,
    do_sample=True,
    num_return_sequences=1,
)
out = model.generate(eval_inputs.input_ids, **sampling_options)

# Decode each returned sequence, strip spaces, and drop the first five
# characters (presumably the leading '[CLS]' marker - confirm).
decoded = [full_tokenizer.decode(c).replace(' ', '')[5:] + '\n' for c in out]
# Bare expression: displayed as the cell result in a notebook / REPL.
out.shape, decoded
使用第二种生成的如下:
'一片清光万里同,几人曾见月如弓。只今犹有中秋月,曾照当时照大东。[SEP]脚亭亭似水哉,天风吹下玉楼台。夜深环佩归何处,更向桥边礼斗来。[SEP]人无限好楼台,楼上笙歌次第催。一片笙歌人并醉,月中倒载舞衣回
第一种:
import numpy as np import torch
# Fix the random seeds so repeated runs produce the same samples.
torch.manual_seed(2)
# torch.manual_seed does not affect NumPy's generator; the sampling loop shown
# earlier in this thread draws tokens with np.random.choice, so NumPy must be
# seeded as well for the run to be reproducible.
np.random.seed(2)

# Generation settings, read by the sampling loop that follows.
n_ctx = model.config.n_ctx
temperature = 2
repitition_penalty = 1.7
model.eval() with torch.no_grad(): for generated in '家无铜臭孔方兄':
# 第二种 (approach 2): let the model's own `generate` method do the sampling.
# Relies on `model`, `full_tokenizer` and `device` being defined earlier in the session.

# The prompt is just the '[CLS]' marker, tokenized without extra special tokens.
eval_inputs = full_tokenizer('[CLS]', return_tensors='pt', add_special_tokens=False)
eval_inputs.to(device)
print(eval_inputs)

# All sampling settings gathered in one place.
sampling_options = dict(
    max_length=100,
    temperature=1.0,
    top_k=8,
    top_p=0.3,
    repetition_penalty=1.0,
    do_sample=True,
    num_return_sequences=1,
)
out = model.generate(eval_inputs.input_ids, **sampling_options)

# Decode each returned sequence, strip spaces, and drop the first five
# characters (presumably the leading '[CLS]' marker - confirm).
decoded = [full_tokenizer.decode(c).replace(' ', '')[5:] + '\n' for c in out]
# Bare expression: displayed as the cell result in a notebook / REPL.
out.shape, decoded