dqshuai / MetaFormer

A PyTorch implementation of "MetaFormer: A Unified Meta Framework for Fine-Grained Recognition", plus a reference PyTorch implementation of "CoAtNet: Marrying Convolution and Attention for All Data Sizes".
MIT License
210 stars, 36 forks

Regarding embedding files for cub. #8

Closed BakingBrains closed 2 years ago

BakingBrains commented 2 years ago

Could you please confirm whether this is the right way to generate the embeddings?

from transformers import AutoTokenizer, AutoModel
import pickle

text_file = "file.txt"

# Read the raw caption text.
with open(text_file, 'r') as rr:
    data = rr.read()

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Tokenize the whole file at once; truncation guards against
# exceeding BERT's 512-token input limit.
inputs = tokenizer(data, return_tensors="pt", truncation=True)
outputs = model(**inputs)
print(outputs[0])  # outputs[0] is the per-token last_hidden_state

embeddings = {'embedding_words': outputs[0].detach().numpy()}

with open('filepickle', 'wb') as pkl:
    pickle.dump(embeddings, pkl)

Thank You

dqshuai commented 2 years ago

This is how I generate the embeddings:

import os
import pickle

# tokenizer/model are the same bert-base-uncased pair as above;
# root, text_root, and embedding_root point to the CUB dataset,
# the caption files, and the output directory respectively.
with open(os.path.join(root, 'CUB_200_2011', 'images.txt'), 'r') as f:
    for line in f:
        image_id, file_name = line.split()
        text_file = text_root + file_name.replace('.jpg', '.txt')
        # Read the per-image captions, stripping the UTF-8 replacement
        # characters (b'\xef\xbf\xbd') left behind by bad encodings.
        text_list = []
        with open(text_file, 'r') as f_text:
            for text_line in f_text:
                raw = text_line.encode(encoding='UTF-8', errors='strict')
                raw = raw.replace(b'\xef\xbf\xbd\xef\xbf\xbd', b' ')
                text_list.append(raw.decode('UTF-8', 'strict'))
        # Encode all captions for one image as a single padded batch.
        inputs = tokenizer(text_list, return_tensors="pt",
                           padding="max_length", truncation=True, max_length=32)
        outputs = model(**inputs)
        # outputs[1] is the pooled [CLS] output; outputs[0] the per-token states.
        embedding_mean = outputs[1].mean(dim=0).reshape(1, -1).detach().numpy()
        embedding_full = outputs[1].detach().numpy()
        embedding_words = outputs[0].detach().numpy()
        data_dict = {
            'embedding_mean': embedding_mean,
            'embedding_full': embedding_full,
            'embedding_words': embedding_words,
        }
        # Mirror the CUB class/image directory layout for the pickles.
        class_name, image_name = file_name.split('/')
        class_dir = os.path.join(embedding_root, class_name)
        os.makedirs(class_dir, exist_ok=True)
        embedding_file_path = os.path.join(class_dir, image_name.replace('.jpg', '.pickle'))
        with open(embedding_file_path, 'wb') as f_write:
            pickle.dump(data_dict, f_write)
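Once the loop above has run, each image has a pickle holding the three arrays. A minimal sketch of loading one back to check its contents (the `load_embedding` helper and the example path are illustrative, not part of the repo; the 768 hidden size is BERT-base's):

```python
import pickle

def load_embedding(path):
    """Load one per-image embedding pickle and report each array's shape."""
    with open(path, 'rb') as f:
        data = pickle.load(f)
    for key, arr in data.items():
        print(key, arr.shape)
    return data

# Hypothetical path; adjust to your embedding_root layout:
# data = load_embedding('bert_embedding/<class_name>/<image_name>.pickle')
```

Expected keys are `embedding_mean` (1, 768), `embedding_full` (num_captions, 768), and `embedding_words` (num_captions, 32, 768), assuming the `max_length=32` tokenization above.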

Moreover, `bert_embedding` contains the already-generated embeddings for CUB. Hope it helps you!
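The byte-level cleanup inside the loop removes pairs of UTF-8 replacement characters (U+FFFD, encoded as `b'\xef\xbf\xbd'`) that mojibake in the CUB caption files leaves behind. A standalone sketch of just that step (the `clean_caption` name is illustrative):

```python
def clean_caption(text: str) -> str:
    """Replace each pair of U+FFFD replacement characters with a space,
    mirroring the byte-level cleanup used on the CUB caption files."""
    raw = text.encode('UTF-8', errors='strict')
    # b'\xef\xbf\xbd' is U+FFFD in UTF-8; the pairs come in twos here.
    raw = raw.replace(b'\xef\xbf\xbd\xef\xbf\xbd', b' ')
    return raw.decode('UTF-8', 'strict')
```

Note this only touches doubled replacement characters; a lone U+FFFD passes through unchanged, matching the original code's behavior.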

BakingBrains commented 2 years ago

Thanks a lot @dqshuai. Awesome work. This solved my doubt.

Thank You