Stability-AI / stablediffusion

High-Resolution Image Synthesis with Latent Diffusion Models
MIT License
38.33k stars 4.95k forks source link

super resolution infer code #288

Open sunmeng7 opened 1 year ago

sunmeng7 commented 1 year ago

I wrote some super-resolution inference code, but I am having trouble choosing the parameter settings. My code is as follows:

import torch
import numpy as np
from PIL import Image
from omegaconf import OmegaConf
from einops import repeat, rearrange
from pytorch_lightning import seed_everything
from imwatermark import WatermarkEncoder

from scripts.txt2img import put_watermark
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.util import exists, instantiate_from_config
import os

# Expose only the GPU with index 1 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Model definition (YAML config) and checkpoint file to load.
config = "configs/stable-diffusion/x4-upscaling.yaml"
ckpt = "x4-upscaler-ema.ckpt"

# Sampling settings.
seed = 50000
num_samples = 1
h = w = 64  # 64 * factor(4) = 256
# Noise-augmentation level, kept as a one-element integer tensor.
noise_level = torch.tensor([2])

def make_noise_augmentation(model, batch, noise_level=None):
    x_low = batch[model.low_scale_key]
    x_low = x_low.to(memory_format=torch.contiguous_format).float()
    x_aug, noise_level = model.low_scale_model(x_low, noise_level)
    return x_aug, noise_level

# --- Model setup -----------------------------------------------------------
# Replace the config path with the parsed config and build the model from it.
config = OmegaConf.load(config)
model = instantiate_from_config(config.model)
# map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
# machine as well; the model is moved to the selected device just below.
# strict=False tolerates key mismatches between checkpoint and model.
model.load_state_dict(
    torch.load(ckpt, map_location="cpu")["state_dict"], strict=False)

device = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
sampler = DDIMSampler(model)

# --- Input -----------------------------------------------------------------
image_path = 'dataset/ffhq_64_512/lr_64/00030.png'
txt = 'This a person.'
# The context manager closes the file handle once the RGB copy is made.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

# One integer noise level per sample, on the model's device.
noise_level = torch.Tensor(num_samples * [noise_level]).to(sampler.model.device).long()

# uint8 pixel values in [0, 255] -> float32 in [-1, 1].
image = np.array(image).astype(np.uint8)
image = (image/127.5 - 1.0).astype(np.float32)
image = torch.from_numpy(image)
batch = {
    "lr": rearrange(image, 'h w c -> 1 c h w'),
    "txt": num_samples * [txt],
}
batch["lr"] = repeat(batch["lr"].to(device=device), "1 ... -> n ...", n=num_samples)

# Seed before the noise augmentation so that any randomness it uses is
# reproducible too. The start code comes from its own RandomState, so it is
# unaffected by where the global seeding happens.
seed_everything(seed)
prng = np.random.RandomState(seed)
start_code = prng.randn(num_samples, model.channels, h, w)
start_code = torch.from_numpy(start_code).to(
    device=device, dtype=torch.float32)

# Inference only: no autograd graph is needed for conditioning, sampling or
# decoding.
with torch.no_grad():
    c = model.cond_stage_model.encode(batch["txt"])

    x_augment, noise_level = make_noise_augmentation(model, batch, noise_level)
    cond = {"c_concat": [x_augment], "c_crossattn": [c], "c_adm": noise_level}
    uc_cross = model.get_unconditional_conditioning(num_samples, "")
    uc_full = {"c_concat": [x_augment], "c_crossattn": [uc_cross], "c_adm": noise_level}
    # NOTE(review): h and w presumably have to equal the low-res input's
    # height and width, since x_augment is passed as c_concat -- confirm.
    shape = [model.channels, h, w]

    # NOTE(review): the sampler's guidance formula is not visible here. If it
    # is standard classifier-free guidance, a scale below 1 weights the
    # prompt-conditioned prediction less than the unconditional one --
    # confirm that 0.08333 is intended (7.0 and 1.0 were also tried).
    samples, intermediates = sampler.sample(
        S=50,
        batch_size=num_samples,
        shape=shape,
        conditioning=cond,
        verbose=False,
        eta=0.0,
        unconditional_guidance_scale=0.08333,
        unconditional_conditioning=uc_full,
        x_T=start_code,
        callback=None
    )

    x_samples_ddim = model.decode_first_stage(samples)

# Decoded values are mapped from [-1, 1] to [0, 1] and clamped.
result = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)

# --- Output ----------------------------------------------------------------
wm = "SDV2"
wm_encoder = WatermarkEncoder()
wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
# NCHW in [0, 1] -> NHWC in [0, 255].
result1 = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
for index, img in enumerate(result1):
    output = put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder)
    # A single sample keeps the file name '2.png'; several samples get an
    # index suffix so that they no longer overwrite one another.
    out_path = '2.png' if len(result1) == 1 else f'2_{index}.png'
    output.save(out_path)

My input image is 00030

Its high resolution image is 00030

But my result is 2 1

I have tried adjusting the seed and noise_level. Are there any suitable values?