Just spent a good hour creating this wrapper, which lets you work with legacy (OpenAI) CLIP and SLIP interchangeably. It's a bit hacky, but it works (see the usage sketch after the code):
import sys
from collections import OrderedDict
import torch
import torch.nn as nn
from torchvision import transforms
from CLIP import clip
def normalize(img, input_range = None):
if input_range is None:
minv = img.min()
else:
minv = input_range[0]
img = img - minv
if input_range is None:
maxv = img.max()
else:
maxv = input_range[1] - minv
if maxv != 0:
img = img / maxv
return img
def adjust_range(img, out_range, input_range = None):
img = normalize(img, input_range = input_range)
img = img * (out_range[1] - out_range[0])
img = img + out_range[0]
return img
class CLIP_Base():
# Default CLIP model from OpenAI
    def __init__(self, model, preprocess=None):  # `preprocess` from clip.load() is PIL-based and unused; tensor transforms are defined below
self.device = "cuda"
self.model = model.eval()
self.preprocess_transform = transforms.Compose([
transforms.Resize(224),
transforms.CenterCrop(224),
transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])
def preprocess(self, imgs, input_range = None):
imgs = adjust_range(imgs, [0.,1.], input_range = input_range)
return self.preprocess_transform(imgs)
def encode_img(self, imgs, input_range = None, apply_preprocess = True):
if apply_preprocess:
            imgs = self.preprocess(imgs, input_range = input_range)
img_embeddings = self.model.encode_image(imgs)
return img_embeddings / img_embeddings.norm(dim=-1, keepdim=True)
def encode_text(self, texts):
text_embeddings = torch.stack([self.model.encode_text(clip.tokenize(text).to(self.device)).detach().clone() for text in texts])
return text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
# TODO: this is very hacky, must fix this later
sys.path.append('/home/Desktop/GitHub_Projects/SLIP')
import models
from tokenizer import SimpleTokenizer
import utils
class SLIP_Base():
def __init__(self, model_name):
self.device = "cuda"
if model_name == "SLIP_VITB16":
ckpt_path = "/home/GitHub_Projects/SLIP/pretrained_models/slip_base_100ep.pt"
elif model_name == "SLIP_VITS16":
ckpt_path = "/home/GitHub_Projects/SLIP/pretrained_models/slip_small_100ep.pt"
self.preprocess_transform = transforms.Compose([
transforms.Resize(224),
transforms.CenterCrop(224),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
self.tokenizer = SimpleTokenizer()
ckpt = torch.load(ckpt_path, map_location='cpu')
state_dict = OrderedDict()
for k, v in ckpt['state_dict'].items():
state_dict[k.replace('module.', '')] = v
# create model
old_args = ckpt['args']
old_args.model = model_name
model = getattr(models, old_args.model)(rand_embed=False,
ssl_mlp_dim=old_args.ssl_mlp_dim, ssl_emb_dim=old_args.ssl_emb_dim)
model.cuda().requires_grad_(False).eval()
model.load_state_dict(state_dict, strict=True)
n_params = sum(p.numel() for p in model.parameters())
print("Loaded perceptor %s: %.2fM params" %(model_name, (n_params/1000000)))
self.model = utils.get_model(model)
def preprocess(self, imgs, input_range = None):
imgs = adjust_range(imgs, [0.,1.], input_range = input_range)
return self.preprocess_transform(imgs)
def encode_img(self, imgs, input_range = None, apply_preprocess = True):
if apply_preprocess:
imgs = self.preprocess(imgs, input_range = input_range)
image_features = self.model.encode_image(imgs)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
return image_features
def encode_text(self, texts):
texts = self.tokenizer(texts).cuda(non_blocking=True)
texts = texts.view(-1, 77).contiguous()
text_embeddings = self.model.encode_text(texts)
text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
return text_embeddings.unsqueeze(1)
def get_clip_perceptor(clip_model_name):
if clip_model_name in ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']:
perceptor, preprocess = clip.load(clip_model_name, jit=False, device = "cuda")
perceptor = perceptor.requires_grad_(False).eval()
n_params = sum(p.numel() for p in perceptor.parameters())
print("Loaded CLIP %s: %.2fM params" %(clip_model_name, (n_params/1000000)))
clip_perceptor = CLIP_Base(perceptor, preprocess)
else:
clip_perceptor = SLIP_Base(clip_model_name)
return clip_perceptor
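For reference, a minimal usage sketch of how the wrapper is meant to be called. This is only a sketch: it assumes CUDA is available, the SLIP checkpoint paths above exist, and your images are float tensors; the dummy batch and prompts are placeholders.

import torch

# Dummy batch of 4 RGB images in [0, 1], shape (N, 3, H, W), already on the GPU.
imgs = torch.rand(4, 3, 256, 256, device="cuda")
texts = ["a photo of a dog", "a photo of a cat"]

# Same interface for both the OpenAI CLIP and the SLIP perceptors.
for name in ["ViT-B/32", "SLIP_VITB16"]:
    perceptor = get_clip_perceptor(name)
    with torch.no_grad():
        img_emb = perceptor.encode_img(imgs, input_range=[0., 1.])  # (4, D), L2-normalized
        txt_emb = perceptor.encode_text(texts).squeeze(1)           # (2, D), L2-normalized
    sims = img_emb @ txt_emb.T  # cosine similarities, shape (4, 2)
    print(name, sims.shape)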
Thank you so much. One more question: for class CLIP_Base() and class SLIP_Base(), do I need to add a forward() method so that clip_perceptor = CLIP_Base(perceptor, preprocess) or SLIP_Base(clip_model_name) can be used? If so, how should it be written?
You don't have to add a forward() method; you can do something like this:
import os
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import CIFAR100
from tqdm import tqdm
# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SLIP_Base('SLIP_VITB16')
preprocess = transforms.Compose([
transforms.Resize(224),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
# Load the dataset
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)
def get_features(dataset):
all_features = []
all_labels = []
with torch.no_grad():
for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
features = model.encode_img(images.to(device),apply_preprocess=False)
all_features.append(features)
all_labels.append(labels)
return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()
# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)
# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)
# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")
How do I use SLIP to make predictions for a specific picture? For example, with CLIP:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load(r"D:\OD\CLIP\ViT-B-32.pt", device=device)

image = preprocess(Image.open("fuliqiang.png")).unsqueeze(0).to(device)
text = clip.tokenize(["sleep", "play cellphone", "work"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937 0.00421068 0.00299572]]