xiankgx opened 1 week ago
Then I tried to modify the code to do actual outpainting by adding padding, and this is what I get. It didn't generate anything in the masked areas; it just kept the original values from the input image. I attribute this to the strength=0.9999 parameter, which somehow leaks the original pixels in the masked area into the generated image.
Changing the strength to 1.0 makes things kind of work, but the masked areas are not coherent with the unmasked areas.
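For context, here is a rough paraphrase of how the diffusers SDXL inpaint pipelines pick their starting latents from strength (prepare_inpaint_latents and its simplified arguments are my own names, not the exact library code): with strength just below 1.0 the run starts from a noised copy of the input image, which would explain how the original pixels under the mask leak into the result.

def prepare_inpaint_latents(scheduler, vae, image, noise, latent_timestep, strength):
    # Rough paraphrase of the latent initialization in diffusers' inpaint
    # pipelines; not the exact library code.
    if strength >= 1.0:
        # Pure noise: nothing of the original image survives under the mask.
        return noise * scheduler.init_noise_sigma
    # strength < 1.0: start from the VAE-encoded input image with noise added
    # at an intermediate timestep, so the original pixels (including those
    # under the mask) still bias the generation.
    image_latents = vae.encode(image).latent_dist.sample() * vae.config.scaling_factor
    return scheduler.add_noise(image_latents, noise, latent_timestep)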
Here is my source code. What do you think could be wrong? @xinsir6
# Test ControlNet with diffusers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import sys
sys.path.append('..')
import cv2
import copy
import torch
import random
import numpy as np
from PIL import Image
from mask import get_mask_generator
from diffusers.utils import load_image
from diffusers import EulerAncestralDiscreteScheduler, AutoencoderKL
from models.controlnet_union import ControlNetModel_Union
from pipeline.pipeline_controlnet_union_inpaint_sd_xl import StableDiffusionXLControlNetUnionInpaintPipeline
device=torch.device('cuda:0')
eulera_scheduler = EulerAncestralDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
# Note: you have to switch the model and the config to the promax version manually; the default is not the promax version.
from huggingface_hub import snapshot_download
snapshot_download(repo_id="xinsir/controlnet-union-sdxl-1.0", local_dir='controlnet-union-sdxl-1.0')
# Make a new directory controlnet-union-sdxl-1.0-promax, move the promax config and the promax model into it, and rename them to the default filenames.
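# For example (assuming the promax files in the HF repo are named
# config_promax.json and diffusion_pytorch_model_promax.safetensors):
#   mkdir controlnet-union-sdxl-1.0-promax
#   cp controlnet-union-sdxl-1.0/config_promax.json controlnet-union-sdxl-1.0-promax/config.json
#   cp controlnet-union-sdxl-1.0/diffusion_pytorch_model_promax.safetensors controlnet-union-sdxl-1.0-promax/diffusion_pytorch_model.safetensors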
controlnet_model = ControlNetModel_Union.from_pretrained("./controlnet-union-sdxl-1.0-promax", torch_dtype=torch.float16, use_safetensors=True)
pipe = StableDiffusionXLControlNetUnionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet_model,
    vae=vae,
    torch_dtype=torch.float16,
    # scheduler=ddim_scheduler,
    scheduler=eulera_scheduler,
)
pipe = pipe.to(device)
def HWC3(x):
    # Ensure an HxWx3 uint8 image: grayscale is replicated to 3 channels,
    # RGBA is composited over a white background.
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    H, W, C = x.shape
    assert C == 1 or C == 3 or C == 4
    if C == 3:
        return x
    if C == 1:
        return np.concatenate([x, x, x], axis=2)
    if C == 4:
        color = x[:, :, 0:3].astype(np.float32)
        alpha = x[:, :, 3:4].astype(np.float32) / 255.0
        y = color * alpha + 255.0 * (1.0 - alpha)
        y = y.clip(0, 255).astype(np.uint8)
        return y
mask_gen_kwargs = {
    "min_padding_percent": 0.06,
    "max_padding_percent": 0.30,
    "left_padding_prob": 0.5,
    "top_padding_prob": 0.5,
    "right_padding_prob": 0.5,
    "bottom_padding_prob": 0.5,
}
mask_gen = get_mask_generator(kind='outpainting', kwargs=mask_gen_kwargs)
prompt = "Couple, walking and mountain travel for holiday, adventure and happy journey for bonding in nature. Outdoor, people and honeymoon vacation or date together, explore and love in jungle or wilderness"
negative_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
seed = random.randint(0, 2147483647)
# The original image you want to repaint.
import io
import requests
from PIL import Image
def load_image(path):
    # Overrides the diffusers.utils load_image imported above; returns a BGR
    # numpy array, matching what cv2.imread would give.
    if path.startswith("http"):
        req = requests.get(path)
        image = Image.open(io.BytesIO(req.content))
    else:
        image = Image.open(path)
    return np.array(image)[:, :, ::-1]  # RGB -> BGR
# original_img = cv2.imread("your image path")
original_img = load_image("https://us.123rf.com/450wm/peopleimages12/peopleimages122405/peopleimages12240520087/229968872-couple-walking-and-mountain-travel-for-holiday-adventure-and-happy-journey-for-bonding-in-nature-out.jpg?ver=6")
# Note that outpainting currently only supports rectangular (axis-aligned) regions, i.e. the mask border should be parallel to the image boundary.
# mask = cv2.imread("your mask image path")
original_code = False
if original_code:
    height, width, _ = original_img.shape
    ratio = np.sqrt(1024. * 1024. / (width * height))
    W, H = int(width * ratio) // 8 * 8, int(height * ratio) // 8 * 8
    original_img = cv2.resize(original_img, (W, H))
    original_img = cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB)
    import copy
    controlnet_img = copy.deepcopy(original_img)
    controlnet_img = np.transpose(controlnet_img, (2, 0, 1))
    mask = mask_gen(controlnet_img)
    controlnet_img = np.transpose(controlnet_img, (1, 2, 0))
    mask = np.transpose(mask, (1, 2, 0))
    controlnet_img[mask.squeeze() > 0.0] = 0
    mask = HWC3((mask * 255).astype('uint8'))
    controlnet_img = Image.fromarray(controlnet_img)
    original_img = Image.fromarray(original_img)
    mask = Image.fromarray(mask)
else:
    def create_controlnet_outpaint_inputs(ori_image: Image.Image, l: float = 0, r: float = 0, t: float = 0, b: float = 0):
        # Pad the image on each side by a fraction of its width/height.
        # Returns: (white-padded original for the pipeline's `image`,
        #           zero-padded image for the controlnet repaint input,
        #           mask that is 255 in the padded/outpainted region).
        ori_image_np = np.array(ori_image)
        h, w = ori_image_np.shape[:2]
        l = int(l * w)
        r = int(r * w)
        t = int(t * h)
        b = int(b * h)
        mask_np = np.zeros((h, w), dtype=np.uint8)
        padded_ori_image_np = np.pad(ori_image_np, [(t, b), (l, r), (0, 0)], "constant", constant_values=0)
        padded_ori_image_np2 = np.pad(ori_image_np, [(t, b), (l, r), (0, 0)], "constant", constant_values=255)
        padded_mask_np = np.pad(mask_np, [(t, b), (l, r)], "constant", constant_values=255)
        return Image.fromarray(padded_ori_image_np2), Image.fromarray(padded_ori_image_np), Image.fromarray(padded_mask_np).convert("RGB")

    original_img = Image.fromarray(original_img[:, :, ::-1])  # BGR -> RGB
    original_img, controlnet_img, mask = create_controlnet_outpaint_inputs(
        original_img,
        0.2, 0.3, 0.2, 0.3,
    )
width, height = controlnet_img.size
ratio = np.sqrt(1024. * 1024. / (width * height))
W, H = int(width * ratio) // 8 * 8, int(height * ratio) // 8 * 8
original_img = original_img.resize((W, H))
controlnet_img = controlnet_img.resize((W, H))
mask = mask.resize((W, H))
print(f"H: {H}, W: {W}")
width, height = W, H
# 0 -- openpose
# 1 -- depth
# 2 -- hed/pidi/scribble/ted
# 3 -- canny/lineart/anime_lineart/mlsd
# 4 -- normal
# 5 -- segment
# 6 -- tile
# 7 -- repaint
original_img.save("original_img.jpg")
controlnet_img.save("controlnet_img.jpg")
mask.save("mask.png")
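# Slot 7 ("repaint") is the only active control: the padded image goes in
# position 7 of control_image_list and union_control_type flags position 7.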
images = pipe(
    prompt=[prompt] * 1,
    image=original_img,
    mask_image=mask,
    control_image_list=[0, 0, 0, 0, 0, 0, 0, controlnet_img],
    negative_prompt=[negative_prompt] * 1,
    # generator=generator,
    width=width,
    height=height,
    num_inference_steps=30,
    strength=1.0,
    union_control=True,
    union_control_type=torch.Tensor([0, 0, 0, 0, 0, 0, 0, 1]),
).images
images = Image.fromarray(np.concatenate(list(map(np.array, images)), axis=1))
images.save("generated.jpg")
Looking at the original/input image and at strength=0.9999 in the pipeline call of the original source code, it seems that outpainting only appeared to work because the model was taking hints from the original pixels in the masked areas. While that helps when inpainting, where you want to replace the masked area with something very similar, it does not generate genuinely new content for outpainting.
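For reference, this is my understanding of how strength maps to a starting step in the diffusers img2img/inpaint pipelines (a paraphrase of the timestep selection, not the exact library code; get_start_step is my own helper name): with 30 steps, strength=0.9999 skips only the first, noisiest step, so the sampler starts from a noised copy of the input image rather than from pure noise.

def get_start_step(num_inference_steps: int, strength: float) -> int:
    # Paraphrase of the strength->timestep mapping (get_timesteps) used by
    # diffusers img2img/inpaint pipelines; not the exact library code.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    return max(num_inference_steps - init_timestep, 0)

print(get_start_step(30, 0.9999))  # 1 -> start one step in, from the noised input image
print(get_start_step(30, 1.0))     # 0 -> start from pure noise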
What do you think?
I cloned this repo, cd'd into the promax directory, and ran the controlnet_union_test_outpainting.py script.
With your original script, outpainting seems to work. Here are the original image resized to the generated size, the mask, the controlnet image, and the generated image.