import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto", torch_dtype=torch.float16)
Downloading: 100%|██████████████████████████████████████████████████████████████████████| 1.44k/1.44k [00:00<00:00, 1.09MB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████| 50.8k/50.8k [00:00<00:00, 299kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████| 9.45G/9.45G [1:46:29<00:00, 1.48MB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████| 1.95G/1.95G [23:13<00:00, 1.40MB/s]
input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(input_ids)
/media/me/360work/Neural/venv/lib/python3.8/site-packages/transformers/generation/utils.py:1387: UserWarning: Neither max_length nor max_new_tokens has been set, max_length will default to 20 (self.config.max_length). Controlling max_length via the config is deprecated and max_length will be removed from the config in v5 of Transformers -- we recommend using max_new_tokens to control the maximum length of the generation.
warnings.warn(
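One way to address the warning, as the message itself recommends, is to pass max_new_tokens explicitly and decode the result. A minimal continuation of the session above (the budget of 30 tokens is an arbitrary choice):
outputs = model.generate(input_ids, max_new_tokens=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))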
There are known issues with FP16 for the flan-t5 models (except the smallest one). You can try full precision with torch_dtype=torch.float32, or torch.bfloat16 if your GPU supports it.
import torch
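A minimal sketch of the bfloat16 reload suggested above (this assumes a GPU with bfloat16 support; the prompt and generation settings simply mirror the earlier run):
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto", torch_dtype=torch.bfloat16)
input_ids = tokenizer("translate English to German: How old are you?", return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(input_ids, max_new_tokens=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))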