facebookresearch / SLIP

Code release for SLIP: Self-supervision meets Language-Image Pre-training
MIT License

How to use SLIP to predict a specific picture? #2

Closed · lixiangMindSpore closed this 2 years ago

lixiangMindSpore commented 2 years ago

How can I use SLIP to predict a specific picture? For example, with CLIP:

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("D:\OD\CLIP\ViT-B-32.pt", device=device)

image = preprocess(Image.open("fuliqiang.png")).unsqueeze(0).to(device)
text = clip.tokenize(["sleep", "play cellphone", "work"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937 0.00421068 0.00299572]]

aiXander commented 2 years ago

Just spent a good hour creating this wrapper, which allows working with legacy (OpenAI) CLIP and SLIP interchangeably. It's a bit hacky, but it works:

import sys
from collections import OrderedDict

import torch 
import torch.nn as nn
from torchvision import transforms

from CLIP import clip

def normalize(img, input_range = None):
    if input_range is None:
        minv = img.min()
    else:
        minv = input_range[0]
    img = img - minv

    if input_range is None:
        maxv = img.max()
    else:
        maxv = input_range[1] - minv

    if maxv != 0:
        img = img / maxv

    return img

def adjust_range(img, out_range, input_range = None):
    img = normalize(img, input_range = input_range)
    img = img * (out_range[1] - out_range[0])
    img = img + out_range[0]
    return img

class CLIP_Base():
    # Default CLIP model from OpenAI
    def __init__(self, model, preprocess=None):
        self.device = "cuda"
        self.model  = model.eval()
        # the PIL-based preprocess returned by clip.load() is accepted for API
        # compatibility, but the tensor-based transform below is used instead
        self.clip_preprocess = preprocess

        self.preprocess_transform = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
            ])

    def preprocess(self, imgs, input_range = None):
        imgs = adjust_range(imgs, [0.,1.], input_range = input_range)
        return self.preprocess_transform(imgs)

    def encode_img(self, imgs, input_range = None, apply_preprocess = True):
        if apply_preprocess:
            imgs = self.preprocess(imgs, input_range = input_range)
        img_embeddings = self.model.encode_image(imgs)
        return img_embeddings / img_embeddings.norm(dim=-1, keepdim=True)

    def encode_text(self, texts):
        text_embeddings = torch.stack([self.model.encode_text(clip.tokenize(text).to(self.device)).detach().clone() for text in texts])
        return text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

# TODO: this is very hacky, must fix this later
sys.path.append('/home/Desktop/GitHub_Projects/SLIP')
import models
from tokenizer import SimpleTokenizer
import utils

class SLIP_Base():
    def __init__(self, model_name):
        self.device = "cuda"

        if model_name == "SLIP_VITB16":
            ckpt_path  = "/home/GitHub_Projects/SLIP/pretrained_models/slip_base_100ep.pt"
        elif model_name == "SLIP_VITS16":
            ckpt_path = "/home/GitHub_Projects/SLIP/pretrained_models/slip_small_100ep.pt"
        else:
            raise ValueError("unknown SLIP model name: %s" % model_name)

        self.preprocess_transform = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
            ])

        self.tokenizer = SimpleTokenizer()

        ckpt = torch.load(ckpt_path, map_location='cpu')
        state_dict = OrderedDict()
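        # strip the 'module.' prefix added to parameter names by (Distributed)DataParallel during training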
        for k, v in ckpt['state_dict'].items():
            state_dict[k.replace('module.', '')] = v

        # create model
        old_args = ckpt['args']
        old_args.model = model_name

        model = getattr(models, old_args.model)(rand_embed=False,
            ssl_mlp_dim=old_args.ssl_mlp_dim, ssl_emb_dim=old_args.ssl_emb_dim)
        model.cuda().requires_grad_(False).eval()
        model.load_state_dict(state_dict, strict=True)

        n_params = sum(p.numel() for p in model.parameters())
        print("Loaded perceptor %s: %.2fM params" %(model_name, (n_params/1000000)))

        self.model = utils.get_model(model)

    def preprocess(self, imgs, input_range = None):
        imgs = adjust_range(imgs, [0.,1.], input_range = input_range)
        return self.preprocess_transform(imgs)

    def encode_img(self, imgs, input_range = None, apply_preprocess = True):
        if apply_preprocess:
            imgs = self.preprocess(imgs, input_range = input_range)

        image_features = self.model.encode_image(imgs)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features

    def encode_text(self, texts):
        texts = self.tokenizer(texts).cuda(non_blocking=True)
        texts = texts.view(-1, 77).contiguous()
        text_embeddings = self.model.encode_text(texts)
        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
        return text_embeddings.unsqueeze(1)

def get_clip_perceptor(clip_model_name):
    if clip_model_name in ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']:
        perceptor, preprocess = clip.load(clip_model_name, jit=False, device = "cuda")
        perceptor = perceptor.requires_grad_(False).eval()

        n_params = sum(p.numel() for p in perceptor.parameters())
        print("Loaded CLIP %s: %.2fM params" %(clip_model_name, (n_params/1000000)))
        clip_perceptor = CLIP_Base(perceptor, preprocess)

    else:
        clip_perceptor = SLIP_Base(clip_model_name)

    return clip_perceptor
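
For a single picture, the wrapper can then be used for CLIP-style zero-shot prediction. A minimal sketch (the image path and prompts are placeholders, the checkpoint paths above must exist on your machine, and no learned logit scale is applied, so the softmax only gives a rough ranking of cosine similarities):

import torch
from PIL import Image
from torchvision import transforms

# hypothetical single-image, zero-shot usage of the wrapper above
perceptor = get_clip_perceptor("SLIP_VITB16")

# load the image as a [0, 1] float tensor with a batch dimension
image = transforms.ToTensor()(Image.open("example.png").convert("RGB")).unsqueeze(0).cuda()

with torch.no_grad():
    image_features = perceptor.encode_img(image, input_range = [0., 1.])                    # (1, D), L2-normalized
    text_features  = perceptor.encode_text(["sleep", "play cellphone", "work"]).squeeze(1)  # (3, D), L2-normalized

    # cosine similarities between the image and each prompt
    sims  = image_features @ text_features.t()
    probs = sims.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)

Passing a CLIP model name such as 'ViT-B/32' instead of 'SLIP_VITB16' routes through CLIP_Base, so the same snippet should behave like the clip example in the question above.
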
lixiangMindSpore commented 2 years ago

Just spent a good hour creating this wrapper, which allows working with legacy (OpenAI) CLIP and SLIP interchangeably. It's a bit hacky, but it works:

Thank you so much. In addition, for class CLIP_Base() and class SLIP_Base(), do I need to add a forward() method so that clip_perceptor = CLIP_Base(perceptor, preprocess) or SLIP_Base(clip_model_name) can be used? If so, how should it be written?

lucky630 commented 2 years ago

Thank you so much. In addition, for class CLIP_Base() and class SLIP_Base(), do I need to add a forward() method so that clip_perceptor = CLIP_Base(perceptor, preprocess) or SLIP_Base(clip_model_name) can be used? If so, how should it be written?

You don't have to add a forward() method. You can do something like this:

import os
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# Load the model (SLIP_Base is the wrapper class defined above)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SLIP_Base('SLIP_VITB16')

preprocess = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
            ])
# Load the dataset
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)

def get_features(dataset):
    all_features = []
    all_labels = []

    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_img(images.to(device),apply_preprocess=False)

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")