```python
import json
import os

import numpy as np

import clip
import torch

# The 80 COCO object categories
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
    'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', 'parking_meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove',
    'skateboard', 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell_phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy_bear',
    'hair_drier', 'toothbrush',
]
print(len(COCO_CLASSES))  # 80

# Load the CLIP ViT-B/32 model on CPU
device = "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Build one prompt per object category and tokenize
text_inputs = torch.cat(
    [clip.tokenize(f"a photo of {o}") for o in COCO_CLASSES]
).to(device)

# Encode the prompts; the ViT-B/32 text encoder outputs 512-d features
with torch.no_grad():
    text_features = model.encode_text(text_inputs)

# Save the (80, 512) embedding matrix to disk
text_features = text_features.data.cpu().numpy()
path = 'hico_hoi_clip.npy'
np.save(path, text_features)
```
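As a quick sanity check (a minimal sketch, assuming the script above has been run in the same directory), the saved file should contain one 512-dimensional vector per category:

```python
import numpy as np

# Load the embeddings written by the script above and verify the shape:
# 80 COCO categories x 512 dimensions for the ViT-B/32 text encoder.
features = np.load('hico_hoi_clip.npy')
print(features.shape)  # (80, 512)
print(features.dtype)  # float32, since the model was loaded on CPU
```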
> The word embeddings are extracted by the CLIP model [54] and their dimension is 512.

I'm confused about the word embeddings used in the paper. Could you give more details about how they are generated, such as which CLIP model is used and how the input to the CLIP model is constructed from each object category? Thank you!
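For a single category, the extraction reduces to the following (a minimal sketch using the OpenAI `clip` package; the prompt template `"a photo of {object}"` follows the script above and is an assumption, not necessarily the paper's exact template):

```python
import clip
import torch

device = "cpu"
model, _ = clip.load("ViT-B/32", device)

# One prompt, tokenized to CLIP's fixed 77-token context and encoded;
# the ViT-B/32 text encoder outputs a 512-d feature, matching the
# dimension quoted from the paper.
tokens = clip.tokenize("a photo of person").to(device)
with torch.no_grad():
    embedding = model.encode_text(tokens)
print(embedding.shape)  # torch.Size([1, 512])
```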