I have some code from a few months ago that I've been using to experiment with outpainting. If it works for you for the time being, that'd be awesome.
Code
```py
from enum import Enum
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline
from tqdm.auto import tqdm
class InpaintWalkType(str, Enum):
"""
Enum for inpainting walk types.
"""
UP = "up"
DOWN = "down"
LEFT = "left"
RIGHT = "right"
FORWARD = "forward"
BACKWARD = "backward"
# TODO(aryan): For now, this derives from StableDiffusionInpaintPipeline which limits
# the use of other models. It can be made more generic by allowing the base model to be
# passed in as an argument when constructing the pipeline.
class OutpaintPipeline(StableDiffusionInpaintPipeline):
@staticmethod
def _calculate_translation_per_frame(
translation: int,
translation_frames: int,
) -> List[int]:
"""Helper function to calculate translation per frame."""
step_size = translation // translation_frames
remainder = translation % translation_frames
values = [step_size + (i < remainder) for i in range(translation_frames)]
return values
@staticmethod
def _translate_image_and_mask(
image: Image.Image,
walk_type: InpaintWalkType,
translation: Union[int, Tuple[int, int]],
mask: Optional[np.ndarray] = None,
    ) -> Tuple[Image.Image, Optional[Image.Image]]:
"""Helper function to translate image and mask in given direction by specified translation."""
def apply_translation(dx: int, dy: int) -> Image.Image:
return image.transform(
(image.width, image.height),
Image.AFFINE,
(1, 0, dx, 0, 1, dy),
resample=Image.BICUBIC,
)
if walk_type == InpaintWalkType.UP:
if mask is not None:
mask[:translation, :] = 255
new_image = apply_translation(0, -translation)
elif walk_type == InpaintWalkType.DOWN:
if mask is not None:
mask[-translation:, :] = 255
new_image = apply_translation(0, translation)
elif walk_type == InpaintWalkType.LEFT:
if mask is not None:
mask[:, :translation] = 255
new_image = apply_translation(-translation, 0)
elif walk_type == InpaintWalkType.RIGHT:
if mask is not None:
mask[:, -translation:] = 255
new_image = apply_translation(translation, 0)
elif (
walk_type == InpaintWalkType.FORWARD
or walk_type == InpaintWalkType.BACKWARD
):
tw, th = translation
if mask is not None:
mask[:th, :] = 255
mask[-th:, :] = 255
mask[:, :tw] = 255
mask[:, -tw:] = 255
downsampled_image = image.resize(
(image.width - 2 * tw, image.height - 2 * th), resample=Image.LANCZOS
)
new_image = Image.new("RGB", (image.width, image.height))
new_image.paste(downsampled_image, (tw, th))
return new_image, Image.fromarray(mask) if mask is not None else None
@staticmethod
def _generate_filler_frames(
start_image: Image.Image,
end_image: Image.Image,
walk_type: InpaintWalkType,
actual_translation: Union[int, Tuple[int, int]],
filler_translations: Union[int, List[int]],
) -> List[Image.Image]:
"""Helper function to generate filler frames for given walk type."""
if (
walk_type == InpaintWalkType.FORWARD
or walk_type == InpaintWalkType.BACKWARD
):
if not isinstance(filler_translations, int):
raise ValueError(
f"filler_translations must be of type int for InpaintWalkType.FORWARD or InpaintWalkType.BACKWARD, got {type(filler_translations)}"
)
else:
if not isinstance(filler_translations, list):
raise ValueError(
f"filler_translations must be of type list for InpaintWalkType.UP, InpaintWalkType.DOWN, InpaintWalkType.LEFT or InpaintWalkType.RIGHT, got {type(filler_translations)}"
)
frames = []
width = start_image.width
height = start_image.height
if walk_type == InpaintWalkType.UP:
for filler_translation in filler_translations:
a = start_image.crop((0, 0, width, height - filler_translation))
b = end_image.crop(
(
0,
actual_translation - filler_translation,
width,
actual_translation,
)
)
result_img = Image.new("RGB", (width, height))
result_img.paste(b, (0, 0))
result_img.paste(a, (0, b.height))
frames.append(result_img)
elif walk_type == InpaintWalkType.DOWN:
for filler_translation in filler_translations:
a = start_image.crop((0, filler_translation, width, height))
b = end_image.crop(
(
0,
height - actual_translation,
width,
height - actual_translation + filler_translation,
)
)
result_img = Image.new("RGB", (width, height))
result_img.paste(a, (0, 0))
result_img.paste(b, (0, a.height))
frames.append(result_img)
elif walk_type == InpaintWalkType.LEFT:
for filler_translation in filler_translations:
a = start_image.crop((0, 0, width - filler_translation, height))
b = end_image.crop(
(
actual_translation - filler_translation,
0,
actual_translation,
height,
)
)
result_img = Image.new("RGB", (width, height))
result_img.paste(b, (0, 0))
result_img.paste(a, (b.width, 0))
frames.append(result_img)
elif walk_type == InpaintWalkType.RIGHT:
for filler_translation in filler_translations:
a = start_image.crop((filler_translation, 0, width, height))
b = end_image.crop(
(
width - actual_translation,
0,
width - actual_translation + filler_translation,
height,
)
)
result_img = Image.new("RGB", (width, height))
result_img.paste(a, (0, 0))
result_img.paste(b, (a.width, 0))
frames.append(result_img)
elif (
walk_type == InpaintWalkType.FORWARD
or walk_type == InpaintWalkType.BACKWARD
):
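            # For FORWARD/BACKWARD, interpolate the zoom geometrically so the apparent
            # zoom speed stays roughly constant, and paste the sharp original start
            # image back into the centre of each frame to avoid resampling blur.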
aw, ah = actual_translation
width_factor = 1 - 2 * aw / width
height_factor = 1 - 2 * ah / height
width_crop_factor = width - 2 * aw
height_crop_factor = height - 2 * ah
translated_image, _ = OutpaintPipeline._translate_image_and_mask(
start_image, walk_type, actual_translation, mask=None
)
translated_image = translated_image.convert("RGBA")
translated_image = np.array(translated_image)
translated_image[:ah, :, 3] = 0
translated_image[-ah:, :, 3] = 0
translated_image[:, :aw, 3] = 0
translated_image[:, -aw:, 3] = 0
translated_image = Image.fromarray(translated_image)
end_image.paste(translated_image, mask=translated_image)
for i in range(filler_translations - 1):
translation_factor = 1 - (i + 1) / filler_translations
interpolation_image = end_image
interpolation_width = round(
(1 - width_factor**translation_factor) * width / 2
)
interpolation_height = round(
(1 - height_factor**translation_factor) * height / 2
)
interpolation_image = interpolation_image.crop(
(
interpolation_width,
interpolation_height,
width - interpolation_width,
height - interpolation_height,
)
).resize((width, height), resample=Image.LANCZOS)
w = width - 2 * interpolation_width
h = height - 2 * interpolation_height
crop_fix_width = round((1 - width_crop_factor / w) * width / 2)
crop_fix_height = round((1 - height_crop_factor / h) * height / 2)
start_image_crop_fix, _ = OutpaintPipeline._translate_image_and_mask(
start_image, walk_type, (crop_fix_width, crop_fix_height), mask=None
)
start_image_crop_fix = start_image_crop_fix.convert("RGBA")
start_image_crop_fix = np.array(start_image_crop_fix)
start_image_crop_fix[:crop_fix_height, :, 3] = 0
start_image_crop_fix[-crop_fix_height:, :, 3] = 0
start_image_crop_fix[:, :crop_fix_width, 3] = 0
start_image_crop_fix[:, -crop_fix_width:, 3] = 0
start_image_crop_fix = Image.fromarray(start_image_crop_fix)
interpolation_image.paste(
start_image_crop_fix, mask=start_image_crop_fix
)
frames.append(interpolation_image)
frames.append(end_image)
return frames
@torch.no_grad()
def __call__(
self,
prompt: Union[str, List[str]],
image: Image.Image = None,
mask_image: Image.Image = None,
height: int = 512,
width: int = 512,
strength: float = 1,
num_inference_steps: int = 50,
guidance_scale: float = 7.5,
negative_prompt: Optional[Union[str, List[str]]] = None,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
output_type: str = "pil",
return_dict: bool = True,
):
return super().__call__(
prompt=prompt,
image=image,
mask_image=mask_image,
height=height,
width=width,
strength=strength,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
negative_prompt=negative_prompt,
generator=generator,
output_type=output_type,
return_dict=return_dict,
)
generate = __call__
@torch.no_grad()
def walk(
self,
prompt: Union[str, List[str]],
image: Image.Image = None,
walk_type: Union[
InpaintWalkType, List[InpaintWalkType]
] = InpaintWalkType.BACKWARD,
height: int = 512,
width: int = 512,
height_translation_per_step: int = 64,
width_translation_per_step: int = 64,
translation_factor: Optional[float] = None,
strength: float = 1,
num_inference_steps: int = 50,
num_interpolation_steps: int = 60,
as_video: bool = True,
guidance_scale: float = 7.5,
negative_prompt: Optional[Union[str, List[str]]] = None,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
output_type: str = "pil",
**kwargs,
):
if translation_factor is not None:
if translation_factor < 0 or translation_factor > 1:
raise ValueError(
f"translation_factor must be between 0 and 1, got {translation_factor}"
)
height_translation_per_step = int(height * translation_factor)
width_translation_per_step = int(width * translation_factor)
if isinstance(walk_type, str):
walk_type = InpaintWalkType(walk_type)
if isinstance(walk_type, InpaintWalkType):
walk_type = [walk_type]
if isinstance(walk_type, list):
if isinstance(walk_type[0], str):
walk_type = [InpaintWalkType(x) for x in walk_type]
else:
raise TypeError(
f"walk_type must be of type InpaintWalkType or List[InpaintWalkType], got {type(walk_type)}"
)
num_inpainting_steps = len(walk_type)
if prompt is not None:
if isinstance(prompt, str):
prompt = [prompt]
if len(prompt) == 1:
prompt = prompt * num_inpainting_steps
if len(prompt) != num_inpainting_steps:
raise ValueError(
f"prompt must have length of num_inpainting_steps, got {len(walk_type)} and {num_inpainting_steps}"
)
        if negative_prompt is not None:
            if isinstance(negative_prompt, str):
                negative_prompt = [negative_prompt]
            if len(negative_prompt) == 1:
                negative_prompt = negative_prompt * num_inpainting_steps
            if len(negative_prompt) != num_inpainting_steps:
                raise ValueError(
                    f"negative_prompt must have the same length as num_inpainting_steps, got {len(negative_prompt)} and {num_inpainting_steps}"
                )
        else:
            # Keep the zip() over prompts/negative prompts below working when no
            # negative prompt is provided.
            negative_prompt = [None] * num_inpainting_steps
walk_has_backward = InpaintWalkType.BACKWARD in walk_type
walk_has_forward = InpaintWalkType.FORWARD in walk_type
if walk_has_backward or walk_has_forward:
if height_translation_per_step * 2 > height:
raise ValueError(
f"height_translation_per_step must be less than half of height, got {height_translation_per_step} and {height}"
)
if width_translation_per_step * 2 > width:
raise ValueError(
f"width_translation_per_step must be less than half of width, got {width_translation_per_step} and {width}"
)
else:
if height_translation_per_step >= height:
raise ValueError(
f"height_translation_per_step must be less than height, got {height_translation_per_step} and {height}"
)
if width_translation_per_step >= width:
raise ValueError(
f"width_translation_per_step must be less than width, got {width_translation_per_step} and {width}"
)
height_filler_translations = self._calculate_translation_per_frame(
translation=height_translation_per_step,
translation_frames=num_interpolation_steps,
)
width_filler_translations = self._calculate_translation_per_frame(
translation=width_translation_per_step,
translation_frames=num_interpolation_steps,
)
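        # Convert per-frame deltas into cumulative offsets; each filler frame then
        # knows the total translation applied so far.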
for i in range(1, num_interpolation_steps):
height_filler_translations[i] += height_filler_translations[i - 1]
width_filler_translations[i] += width_filler_translations[i - 1]
assert height_filler_translations[-1] == height_translation_per_step
assert width_filler_translations[-1] == width_translation_per_step
        image = image.resize((width, height), resample=Image.LANCZOS).convert("RGB")
        base_mask = np.zeros((height, width), dtype=np.uint8)
prev_image = image
frames = []
index = 0
for prompt, negative_prompt, walk in tqdm(
zip(prompt, negative_prompt, walk_type)
):
if walk == InpaintWalkType.LEFT or walk == InpaintWalkType.RIGHT:
translation = width_translation_per_step
filler_translations = width_filler_translations
elif walk == InpaintWalkType.UP or walk == InpaintWalkType.DOWN:
translation = height_translation_per_step
filler_translations = height_filler_translations
elif walk == InpaintWalkType.BACKWARD:
translation = (width_translation_per_step, height_translation_per_step)
filler_translations = num_interpolation_steps
elif walk == InpaintWalkType.FORWARD:
raise ValueError(
"InpaintWalkType.FORWARD is not supported yet. If you would like to do this, reverse the sequence of images generated using InpaintWalkType.BACKWARD"
)
else:
raise ValueError(f"Invalid Inpaint Walk Type: {walk}")
image, mask = self._translate_image_and_mask(
prev_image, walk, translation, mask=base_mask.copy()
)
generated_image: Image.Image = self.generate(
prompt=prompt,
image=image,
mask_image=mask,
height=height,
width=width,
strength=strength,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
negative_prompt=negative_prompt,
generator=generator,
output_type=output_type,
return_dict=True,
**kwargs,
).images[0]
if as_video:
filler_frames_list = self._generate_filler_frames(
start_image=prev_image,
end_image=generated_image,
walk_type=walk,
actual_translation=translation,
filler_translations=filler_translations,
)
prev_image = filler_frames_list[-1].copy()
if walk == InpaintWalkType.FORWARD:
filler_frames_list = filler_frames_list[::-1]
frames.extend(filler_frames_list)
if index < num_inpainting_steps - 1:
frames.pop()
else:
prev_image = generated_image.copy()
                frames.append(generated_image)  # keep every generated keyframe instead of overwriting the list
index += 1
return frames
```
Usage
```py
import imageio
from diffusers import StableDiffusionInpaintPipeline
from diffusers.schedulers import DPMSolverMultistepScheduler

inpaint_model = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
    torch_dtype=torch.float32,
    use_safetensors=True,
    variant="fp16",
).to("cuda:0")
inpaint_model.scheduler = DPMSolverMultistepScheduler.from_config(
    inpaint_model.scheduler.config
)

outpaint_model = OutpaintPipeline(
    text_encoder=inpaint_model.text_encoder,
    tokenizer=inpaint_model.tokenizer,
    unet=inpaint_model.unet,
    vae=inpaint_model.vae,
    scheduler=inpaint_model.scheduler,
    safety_checker=None,
    feature_extractor=inpaint_model.feature_extractor,
)
from diffusers.utils import load_image
image = load_image(...)
walk = [InpaintWalkType.RIGHT, InpaintWalkType.RIGHT, InpaintWalkType.BACKWARD, InpaintWalkType.BACKWARD]
images = outpaint_model.walk(
prompt="A painting of a cat, in the style of Vincent Van Gogh, hanging in a room",
negative_prompt="low quality",
image=image,
walk_type=walk,
    height=512,
    width=512,
height_translation_per_step=32,
width_translation_per_step=32,
num_interpolation_steps=30,
num_inference_steps=15,
guidance_scale=10.0,
as_video=True,
)
def pil_to_video(images: List[Image.Image], filename: str, fps: int = 60) -> None:
"""
Convert a list of PIL images to a video.
Parameters
----------
images: List[Image.Image]
List of PIL images.
filename: str
Filename to save video to.
fps: int
Frames per second of video.
"""
frames = [np.array(image) for image in images]
with imageio.get_writer(filename, fps=fps) as video_writer:
for frame in frames:
video_writer.append_data(frame)
pil_to_video(images, "output.mp4", fps=60)
```
Results
Images
Videos
This only supports outpainting in the left, right, up, down, and backward directions, and combinations of them. For forward, you can reverse the video generated with backward. This is actually quite restrictive in terms of usage, and I'd like to try implementing it in a way that allows outpainting in any direction. I've seen some pretty cool posts on Reddit/Twitter of people doing spiral outpainting and the like, and having a pipeline that could do all of that out of the box would be awesome!
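For reference, a minimal sketch of the reverse-the-backward trick (assuming the pipeline and input image are set up as `outpaint_model` and `image` in the usage snippet above; the names are only illustrative):
```py
# Emulate a FORWARD walk by generating a BACKWARD walk and playing it in reverse.
backward_frames = outpaint_model.walk(
    prompt="A painting of a cat, in the style of Vincent Van Gogh, hanging in a room",
    image=image,
    walk_type=[InpaintWalkType.BACKWARD],
    as_video=True,
)
forward_frames = backward_frames[::-1]  # the zoom-out played backwards reads as a zoom-in
```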
@yiyixuxu @patrickvonplaten If you'd like, I could convert this into a community pipeline :)
@a-r-r-o-w Hi, I am very interested in outpainting. Can you share it with me?
Check out the code and usage in my previous reply @Brembles :) It doesn't support much, but I think it's good enough to get started with. I'll add more features if there's interest in adding this as a community pipeline.
I discovered a problem. When the strength parameter is set to any value other than 1, outpainting fails, even at 0.99.
@Brembles I've come across this issue too. The reason, I believe, is that when we perform the outpainting, the transformations on the image are done on a black canvas (the example below is outpainting in the backward direction).
This causes a loss of information and results in poor quality unless the added noise is completely random (the case with strength=1). This implementation is a very rough combination of many ideas from others, and there's a lot of room for improvement. I'm a bit busy this week, but I'll try to improve it and have something better ready over the weekend.
One quick solution that seems to produce better results for other strength values is performing the transformations with the original image as the canvas instead of the black background, i.e. modifying _translate_image_and_mask so the transformed image is pasted onto the original image.
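Roughly something like this in the FORWARD/BACKWARD branch of `_translate_image_and_mask` (just a sketch of the idea, not thoroughly tested):
```py
# Paste the downsampled image onto a copy of the original image instead of a
# fresh black canvas, so the masked border keeps the original pixels as context
# at strengths below 1 rather than pure black.
downsampled_image = image.resize(
    (image.width - 2 * tw, image.height - 2 * th), resample=Image.LANCZOS
)
new_image = image.copy()  # was: Image.new("RGB", (image.width, image.height))
new_image.paste(downsampled_image, (tw, th))
```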
Hi, if I don’t have the original image and it is a complete outpainting task, what should I do? I have tried some methods but the results are not ideal.
The current pipeline above is very basic and doesn't support SDXL. It should be quite easy to make it work with it, though, by deriving from the SDXL inpainting pipeline and making the necessary changes to __call__. I do plan on improving it but am quite busy with university and other contributions at the moment. It would be great if someone from the community could improve on this :)
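As a very rough, untested sketch of what I mean (the class name `OutpaintXLPipeline` is hypothetical), it would start by swapping the base class:
```py
import torch
from diffusers import StableDiffusionXLInpaintPipeline

# Hypothetical starting point: derive from the SDXL inpainting pipeline and
# reuse/port the walk and _translate_image_and_mask helpers from
# OutpaintPipeline above, adjusting __call__ arguments where SDXL differs.
class OutpaintXLPipeline(StableDiffusionXLInpaintPipeline):
    @torch.no_grad()
    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs)
```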
This is awesome! Can you tell me how I can use this code?
@patrickvonplaten @sayakpaul @DN6 Is this something that we could work on adding? There seems to be a good amount of interest in diffusers supporting an infinite-canvas outpainting pipeline similar to RunwayML's Infinite Image. I hope it'll be quite easy to replicate with high quality if done correctly.
@nanaj96 @a-r-r-o-w This can be added as a community pipeline
Thank you. I'll take a closer look and implement the SDXL versions as well very soon in an improved pipeline. There are many issues with what was shared above.
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.
@sayakpaul Apologies for the delay here. I just haven't found time to get back to it. Most people use ComfyUI or other alternatives for outpainting, but an example implementation, possibly as a community Mixin (since the changes are relatively simple), is something others from the community could work on if there's still demand. So maybe it'd be a good idea to add a contributions-welcome label for someone looking to get started with diffusers. Thanks!
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.
Outpainting is very interesting. Does diffusers support it?