Open klei22 opened 1 year ago
Same here. I'm using torch==1.12.1+cu113 and xformers==0.0.20+6425fd0.d20230429. xformers is built from source.
I had the same issue; my solution was to switch to PyTorch 2.0.0+cu118 and disable the xformers memory-efficient attention.
same here
same here
@kanttouchthis Thanks, this worked for me as well!
In summary:
After installing torch 2.0 with CUDA support:
pip install -r requirements.txt # after removing torch
pip install transformers --upgrade
pip install accelerate --upgrade
Then the above test code ran without errors.
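The original test script isn't reproduced here, but a quick sanity check after the reinstall can confirm that the torch 2.0 attention path is available (a minimal sketch; the version strings are only examples):
import torch

# Confirm the CUDA-enabled torch 2.x build is the one actually being imported.
print(torch.__version__)          # e.g. "2.0.0+cu118"
print(torch.cuda.is_available())  # should be True

# torch >= 2.0 ships F.scaled_dot_product_attention, which diffusers uses
# automatically, so enable_xformers_memory_efficient_attention() is not needed.
print(hasattr(torch.nn.functional, "scaled_dot_product_attention"))  # True on 2.x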
Same here. I want to use xformers because I want to run deepfloyd on anything less than torch v2. If I don't use it I get an OOM error.
PyTorch 2.0 automatically applies the same memory-efficient attention that xformers offers (see here). How much VRAM do you have? You can run the diffusers code with very little VRAM by using CPU offloading or sequential offloading:
#16 GB
stage_1.enable_model_cpu_offload()
stage_2.enable_model_cpu_offload()
stage_3.enable_model_cpu_offload()
#6 GB
stage_1.enable_sequential_cpu_offload()
stage_2.enable_model_cpu_offload()
stage_3.enable_model_cpu_offload()
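For context, the stage_1/stage_2/stage_3 objects above come from the usual DeepFloyd IF setup in diffusers; a minimal sketch of how they might be loaded (model ids and fp16 settings follow the examples elsewhere in this thread):
import torch
from diffusers import DiffusionPipeline

# Load the three stages in fp16; enable the offloading calls above afterwards.
stage_1 = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-I-L-v1.0", variant="fp16", torch_dtype=torch.float16
)
stage_2 = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
)
stage_3 = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16
)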
Sure, it works. But it's important to me that it works with PyTorch < 2.
I have the same issue because diffusers raises an explicit exception:
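For reference, the check lives in diffusers' attention code (paraphrased here; the exact file and surrounding code differ between versions, so treat this as a sketch rather than a verbatim quote):
# Inside Attention.set_use_memory_efficient_attention_xformers (paraphrased):
if use_memory_efficient_attention_xformers:
    if self.added_kv_proj_dim is not None:
        # The IF UNet uses added key/value projections, which the xformers
        # path does not handle, hence the hard error reported in this issue.
        raise NotImplementedError(
            "Memory efficient attention with `xformers` is currently not supported when"
            " `self.added_kv_proj_dim` is defined."
        )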
Is there a solution to this problem with torch 1.x?
I found the following solution with torch 1.x: load the model with an 8-bit text encoder and do not use xformers in any stage except the upscale stage.
For 8-bit loading you need bitsandbytes. It is probably also possible to use the XL models by reusing memory on the GPU:
# load stage 1
pipe = ...
# run stage 1, then free it before loading the next stage
del pipe
# load stage 2
pipe = ...
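In practice, deleting the pipeline alone may not return the VRAM right away; a sketch of the reuse pattern with explicit cache clearing (the gc/empty_cache calls are standard PyTorch housekeeping, not anything DeepFloyd-specific, and the model ids are taken from the scripts below):
import gc
import torch
from diffusers import DiffusionPipeline

# Stage 1: load, run, then release VRAM before loading stage 2.
pipe = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-I-L-v1.0", variant="fp16", torch_dtype=torch.float16
).to("cuda")
# ... run stage 1 here ...
del pipe
gc.collect()
torch.cuda.empty_cache()  # release cached CUDA blocks so stage 2 fits

# Stage 2: load into the freed memory.
pipe = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
).to("cuda")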
I run it on a 24 GB GPU, but you can use smaller models. The first script below converts and saves the checkpoints; the second loads them and runs the three stages.
import torch
from diffusers import (DiffusionPipeline, IFPipeline,
IFSuperResolutionPipeline,
StableDiffusionUpscalePipeline)
from transformers import T5EncoderModel
if __name__ == "__main__":
stage_1_model_name = "DeepFloyd/IF-I-L-v1.0"
stage_2_model_name = "DeepFloyd/IF-II-L-v1.0"
stage_3_model_name = "stabilityai/stable-diffusion-x4-upscaler"
text_encoder = T5EncoderModel.from_pretrained(
stage_1_model_name,
subfolder="text_encoder",
device_map="auto",
load_in_8bit=True,
variant="8bit",
torch_dtype=torch.float16
)
text_encoder_pipeline = DiffusionPipeline.from_pretrained(
stage_1_model_name,
text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder
unet=None,
device_map="auto",
)
text_encoder_pipeline.save_pretrained("checkpoints/text_encoder")
pipe1 = IFPipeline.from_pretrained(
stage_1_model_name,
text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder
watermarker=None,
feature_extractor=None,
safety_checker=None,
require_safety_checker=False
)
pipe1.save_pretrained("checkpoints/stage1")
del pipe1
pipe2 = IFSuperResolutionPipeline.from_pretrained(
stage_2_model_name, feature_extractor=None, safety_checker=None, watermarker=None,
text_encoder=None, variant="fp16", torch_dtype=torch.float16,
require_safety_checker=False
)
pipe2.save_pretrained("checkpoints/stage2")
del pipe2
pipe3 = StableDiffusionUpscalePipeline.from_pretrained(
stage_3_model_name,
feature_extractor=None,
safety_checker=None,
watermarker=None,
variant="fp16",
torch_dtype=torch.float16,
)
pipe3.save_pretrained("checkpoints/stage3-super-resol")
import os
from typing import Optional
import torch
from diffusers import (DiffusionPipeline, IFPipeline,
IFSuperResolutionPipeline,
StableDiffusionUpscalePipeline)
class DeeplFloyd:
def __init__(self, checkpoint_dir: str, device: torch.device):
self._device = device
self._text_encoder_pipeline = DiffusionPipeline.from_pretrained(
os.path.join(checkpoint_dir, "text_encoder"),
device_map="auto",
unet=None,
local_files_only=True,
low_cpu_mem_usage=True,
)
self._text_encoder_pipeline.to(device)
self._pipe_stage_1 = IFPipeline.from_pretrained(
os.path.join(checkpoint_dir, "stage1"),
# pass the previously instantiated 8bit text encoder
text_encoder=self._text_encoder_pipeline.text_encoder,
watermarker=None,
feature_extractor=None,
safety_checker=None,
low_cpu_mem_usage=True,
local_files_only=True,
require_safety_checker=False
)
self._pipe_stage_1.set_progress_bar_config(mininterval=5)
self._pipe_stage_1.to(device)
self._pipe_stage_2 = IFSuperResolutionPipeline.from_pretrained(
os.path.join(checkpoint_dir, "stage2"),
feature_extractor=None, safety_checker=None, watermarker=None,
text_encoder=None,
variant="fp16",
torch_dtype=torch.float16,
require_safety_checker=False,
local_files_only=True,
low_cpu_mem_usage=True,
)
self._pipe_stage_2.set_progress_bar_config(mininterval=5)
self._pipe_stage_2.to(device)
self._pipe_stage_3 = StableDiffusionUpscalePipeline.from_pretrained(
os.path.join(checkpoint_dir, "stage3-super-resol"),
feature_extractor=None,
safety_checker=None,
watermarker=None,
variant="fp16",
local_files_only=True,
torch_dtype=torch.float16,
low_cpu_mem_usage=True
)
self._pipe_stage_3.set_progress_bar_config(mininterval=5)
self._pipe_stage_3.enable_xformers_memory_efficient_attention()
self._pipe_stage_3.enable_model_cpu_offload()
self._generator = torch.Generator()
def __call__(self,
prompt: str,
seed: int,
num_inference_steps: int = 100,
num_upscale_steps: int = 70,
neg_prompt: Optional[str] = None):
generator = self._generator.manual_seed(seed)
prompt_embeds, negative_embeds = self._text_encoder_pipeline.encode_prompt(
prompt,
negative_prompt=neg_prompt)
with torch.autocast(self._pipe_stage_1.device.type):
image = self._pipe_stage_1(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
output_type="pt",
num_inference_steps=num_inference_steps,
generator=generator,
).images
with torch.autocast(self._pipe_stage_2.device.type):
image = self._pipe_stage_2(
image=image,
num_inference_steps=max(num_inference_steps // 2, 1),
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
output_type="pt",
generator=generator,
).images
images = self._pipe_stage_3(
image=image,
prompt=prompt,
num_inference_steps=num_upscale_steps,
generator=generator,
).images
return images[0]
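To round this out, a hypothetical usage of the DeeplFloyd class above (the checkpoint directory, prompt, and file name are illustrative, and assume the first script has already saved the checkpoints):
import torch

model = DeeplFloyd(checkpoint_dir="checkpoints", device=torch.device("cuda"))
image = model(
    "a photo of an astronaut riding a horse",
    seed=0,
    neg_prompt="blurry, low quality",
)
image.save("deepfloyd_upscaled.png")  # stage 3 returns PIL images by default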
After going through the README instructions, I tried the following test script just to get started, but I am consistently receiving an error:
NotImplementedError: Memory efficient attention with `xformers` is currently not supported when `self.added_kv_proj_dim` is defined.
(full traceback shared after the test code section)
Test code:
Error traceback: