Hi, HPSv2 is really nice work. However, when I try to reproduce the v2.1 benchmark, I cannot get the same results as those reported in your README. Could you please tell me how to fix this? Here is the code from my Jupyter notebook:
import torch
from PIL import Image
import hpsv2
from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
import warnings
import argparse
import os
import requests
from clint.textui import progress
from typing import Union
import huggingface_hub
from hpsv2.utils import root_path, hps_version_map
#warnings.filterwarnings("ignore", category=UserWarning)
def _score_one(model, one_img, prompt):
    """Score a single image against ``prompt`` and return one scalar.

    Relies on the module-level ``preprocess_val``, ``tokenizer`` and
    ``device`` defined elsewhere in this file.

    Args:
        model: Callable invoked as ``model(image, text)``; its result must be
            indexable by ``"image_features"`` and ``"text_features"``.
        one_img: Either a filesystem path (``str``) or a ``PIL.Image.Image``.
        prompt: The text prompt paired with the image.

    Returns:
        The first element of the diagonal of
        ``image_features @ text_features.T``, as a NumPy scalar.

    Raises:
        TypeError: If ``one_img`` is neither a ``str`` nor a ``PIL.Image.Image``.
    """
    with torch.no_grad():
        # Process the image
        if isinstance(one_img, str):
            # Close the file handle as soon as preprocessing has produced a
            # tensor, instead of leaving one open handle per scored image.
            with Image.open(one_img) as pil_image:
                image = preprocess_val(pil_image).unsqueeze(0).to(device=device, non_blocking=True)
        elif isinstance(one_img, Image.Image):
            image = preprocess_val(one_img).unsqueeze(0).to(device=device, non_blocking=True)
        else:
            raise TypeError('The type of parameter img_path is illegal.')
        # Process the prompt
        text = tokenizer([prompt]).to(device=device, non_blocking=True)
        # Calculate the HPS. The matmul stays inside the autocast region, as
        # in the original code, so the numerical behaviour is unchanged.
        with torch.cuda.amp.autocast():
            outputs = model(image, text)
            image_features, text_features = outputs["image_features"], outputs["text_features"]
            logits_per_image = image_features @ text_features.T
            hps_score = torch.diagonal(logits_per_image).cpu().numpy()
    return hps_score[0]


def score(model, img_path, prompt) -> list:
    """Compute the score of one or several images for a single prompt.

    Args:
        model: Model passed through to the per-image scoring helper.
        img_path: A path (``str``), a ``PIL.Image.Image``, or a ``list``
            whose elements are each a path or a ``PIL.Image.Image``.
        prompt: The text prompt every image is scored against.

    Returns:
        A list of scores: one entry per element when ``img_path`` is a list,
        otherwise a single-element list.

    Raises:
        TypeError: If ``img_path`` (or any element of a list ``img_path``)
            is neither a ``str`` nor a ``PIL.Image.Image``.
    """
    if isinstance(img_path, list):
        return [_score_one(model, one_img_path, prompt) for one_img_path in img_path]
    if isinstance(img_path, (str, Image.Image)):
        return [_score_one(model, img_path, prompt)]
    raise TypeError('The type of parameter img_path is illegal.')
To make it easy to run on every image, I split the original code into two parts:
# One-time model setup: build the model and its preprocessing transforms,
# fetch the checkpoint for the selected HPS version, load its weights, and
# switch the model to eval mode. Defines the module-level names `model`,
# `preprocess_val`, `tokenizer` and `device`.
model_dict = {}
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model, preprocess_train, preprocess_val = create_model_and_transforms(
    'ViT-H-14',
    'laion2B-s32B-b79K',
    precision='amp',
    device=device,
    jit=False,
    force_quick_gelu=False,
    force_custom_text=False,
    force_patch_dropout=False,
    force_image_size=None,
    pretrained_image=False,
    image_mean=None,
    image_std=None,
    light_augmentation=True,
    aug_cfg={},
    output_dict=True,
    with_score_predictor=False,
    with_region_predictor=False
)
# `model` and `preprocess_val` are already bound above; the dict is only
# populated so that any code that looks them up by key keeps working.
model_dict['model'] = model
model_dict['preprocess_val'] = preprocess_val

# Set `cp` to a local checkpoint path to skip the Hugging Face download.
cp = None
hps_version = "v2.1"

# Make sure the package data directory exists; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(root_path, exist_ok=True)

if cp is None:
    cp = huggingface_hub.hf_hub_download("xswu/HPSv2", hps_version_map[hps_version])

# `checkpoint` is the loaded checkpoint object; its 'state_dict' entry holds
# the weights that are loaded into the model.
checkpoint = torch.load(cp, map_location=device)
model.load_state_dict(checkpoint['state_dict'])
tokenizer = get_tokenizer('ViT-H-14')
model = model.to(device)
model.eval()
Then I downloaded the test data and tried to reproduce the results for each category (for example, photo):
from numpy import *
# Score every image of one benchmark category against its prompt and print
# the mean score. `mean` comes from the `from numpy import *` line above.
# The trailing `...` in the prompt list is a placeholder for the remaining
# prompts of the category.
prompts = ["A man taking a drink from a water fountain.", ...]
root = '/my_path/HPDv2/SDXL-refiner-0.9/photo'

# NOTE(review): images are paired with prompts purely by position after a
# lexicographic sort of the file names; this presumably needs to match the
# prompt order of the benchmark -- confirm against the dataset layout.
imgs = sorted(os.listdir(root))

ret = []
for idx, file_name in enumerate(imgs):
    caption = prompts[idx]
    print(file_name, caption)
    image_scores = score(model, os.path.join(root, file_name), caption)
    ret.append(image_scores)

print(mean(ret))
I get 31.52 (vs. the reported 33.26) for anime and 26.51 (vs. the reported 28.38) for photo.
Hi, HPSv2 is really nice work. However, when I try to reproduce the v2.1 benchmark, I cannot get the same results as those reported in your README. Could you please tell me how to fix this? Here is the code from my Jupyter notebook:
To make it easy to run on every image, I split the original code into two parts:
Then I downloaded the test data and tried to reproduce the results for each category (for example, photo):
I get 31.52 (vs. the reported 33.26) for anime and 26.51 (vs. the reported 28.38) for photo.