Essence9999 opened 8 months ago
Ubuntu 22.04 server; GPU: A10 (24 GB); attention_sinks 0.4.0
```python
import torch
from transformers import AutoTokenizer, TextStreamer, GenerationConfig
from attention_sinks import AutoModelForCausalLM

model_id = "/home/work/projects/model/Qwen-7B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    attention_sink_size=4,
    attention_sink_window_size=252,
    trust_remote_code=True,
    use_flash_attn=False,
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

# Prompt: "There are many ways to stay healthy"
text = "保持身体健康有多种方式"
input_ids = tokenizer.encode(text, return_tensors="pt").to(model.device)

streamer = TextStreamer(tokenizer)
generation_config = GenerationConfig(
    use_cache=True,
    min_new_tokens=100_000,
    max_new_tokens=1_000_000,
    penalty_alpha=0.6,
    top_k=5,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
generated_tokens = model.generate(
    input_ids,
    generation_config,
    streamer=streamer,
)
output_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
```
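As a side note on the cache budget (my own arithmetic, not something from the library docs): with attention sinks, the KV cache is capped at `attention_sink_size + attention_sink_window_size` positions no matter how many tokens are generated, so the script above only ever keeps 256 positions in cache.

```python
# Illustrative only: the effective KV-cache budget for the script above.
attention_sink_size = 4
attention_sink_window_size = 252
print("max cached positions:", attention_sink_size + attention_sink_window_size)  # 256
```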
With the base Qwen-7B model, generation finishes normally and does not keep producing long text output:
But when I generate with the fine-tuned Qwen base model, an error appears after some text has been generated:
My code is as follows:
```python
import torch
from transformers import AutoTokenizer, TextStreamer, GenerationConfig
from attention_sinks import AutoModelForCausalLM

model_id = "/home/work/projects/LLaMA-Factory/output/qwen_base"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    attention_sink_size=16,
    attention_sink_window_size=1024,
    trust_remote_code=True,
    use_flash_attn=False,
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

# Prompt: "Write an article on the topic: a discussion of 5G network architecture"
text = "写一篇文章,主题为:5G网络架构探讨"
input_ids = tokenizer.encode(text, return_tensors="pt").to(model.device)

with torch.no_grad():
    streamer = TextStreamer(tokenizer)
    generated_tokens = model.generate(
        input_ids,
        generation_config=GenerationConfig(
            use_cache=True,
            min_new_tokens=100_000,
            max_new_tokens=1_000_000,
            penalty_alpha=0.6,
            top_k=5,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        ),
        streamer=streamer,
    )
output_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
```
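For reference, here is a quick diff of the two checkpoints' configs (a sketch of my own, not part of the scripts above): a LLaMA-Factory export can ship a config that differs from the base checkpoint, and the listed field names are taken from the Qwen config, so not all of them are guaranteed to exist.

```python
# Hypothetical sanity check: compare the relevant config fields of the two checkpoints.
from transformers import AutoConfig

base_cfg = AutoConfig.from_pretrained(
    "/home/work/projects/model/Qwen-7B", trust_remote_code=True
)
ft_cfg = AutoConfig.from_pretrained(
    "/home/work/projects/LLaMA-Factory/output/qwen_base", trust_remote_code=True
)

for key in ("seq_length", "max_position_embeddings", "use_dynamic_ntk", "use_logn_attn"):
    # getattr with a default, since a given field may be absent from either config
    print(key, getattr(base_cfg, key, None), getattr(ft_cfg, key, None))
```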