Open lovehuanhuan opened 1 week ago
The result is shown below:
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
/usr/local/lib/python3.10/dist-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=None`.
warnings.warn(msg)
number of params: 143936517
/mnt/SegVG/run_example.py:230: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
checkpoint = torch.load(model_path, map_location='cpu') # 加载预训练的权重
Missing keys
[]
Unexpected keys
['textmodel.embeddings.position_ids']
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 1037, 2450, 2007, 2146, 2606, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
shape
tensor([[ 1.4612, 1.4954, 1.5297, ..., 1.3755, 1.3927, 1.4783],
[ 1.4612, 1.4954, 1.5297, ..., 1.3755, 1.3927, 1.4783],
[ 1.4612, 1.5125, 1.5297, ..., 1.3755, 1.3927, 1.4783],
...,
[-1.6384, -1.6727, -1.6555, ..., 0.2967, 0.3481, 0.5193],
[-1.6384, -1.6384, -1.6042, ..., 0.2796, 0.3309, 0.4851],
[-1.6042, -1.6042, -1.5528, ..., 0.2796, 0.3309, 0.4851]],
device='cuda:0')
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:1086: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.
warnings.warn(
Predicted bounding box: [[[nan nan nan nan]]
[[nan nan nan nan]]
[[nan nan nan nan]]
[[nan nan nan nan]]
[[nan nan nan nan]]
[[nan nan nan nan]]]
With this code, I got a [nan, nan, nan, nan] box output:
import torch from PIL import Image import os import torch.utils.data as data from torchvision import transforms import matplotlib.pyplot as plt from datasets import SegVGDataset from datasets import InputExample from datasets import convert_examples_to_features from models import build_model import argparse from utils.misc import * from pytorch_pretrained_bert.tokenization import BertTokenizer
from datasets import data_loader
def get_args_parser():
    """Build the command-line argument parser for the detector script.

    The parser is created with ``add_help=False``, so it defines no
    ``-h/--help`` option of its own.

    NOTE(review): only the optimisation-related options are visible in this
    paste; any further options of the full script are not reproduced here.

    Returns:
        argparse.ArgumentParser: parser carrying the options defined below.
    """
    parser = argparse.ArgumentParser('Set transformer detector', add_help=False)

    # Learning rates.
    parser.add_argument('--lr', default=1e-4, type=float)
    parser.add_argument('--lr_bert', default=0., type=float)
    parser.add_argument('--lr_visu_cnn', default=0., type=float)
    parser.add_argument('--lr_visu_tra', default=1e-5, type=float)

    # General training options.
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--weight_decay', default=1e-4, type=float)
    parser.add_argument('--epochs', default=100, type=int)
    parser.add_argument('--lr_power', default=0.9, type=float,
                        help='lr poly power')
    parser.add_argument('--clip_max_norm', default=0., type=float,
                        help='gradient clipping max norm')
    parser.add_argument('--eval', dest='eval', default=False,
                        action='store_true', help='if evaluation only')

    # Optimizer and schedule selection.
    parser.add_argument('--optimizer', default='rmsprop', type=str)
    parser.add_argument('--lr_scheduler', default='poly', type=str)
    parser.add_argument('--lr_drop', default=80, type=int)

    # Without this return the caller receives None instead of the parser.
    return parser
def load_image(image_path, imsize=640):
    """Load and preprocess a single image.

    Args:
        image_path: Path of the image file to open.
        imsize: Not used by the visible code; presumably the target image
            size for preprocessing (TODO confirm against the full script).

    NOTE(review): only the loading step is visible in this paste. The
    preprocessing and the return statement appear to be missing, so as
    written this function returns ``None``.
    """
    # Force three channels regardless of the source image mode.
    image = Image.open(image_path).convert("RGB")
def load_text_data(text_input, tokenizer, max_query_len=64):
    """Convert a text query into fixed-length model inputs.

    Args:
        text_input: Raw query string.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_query_len: Length of the returned sequences, including the
            ``[CLS]`` and ``[SEP]`` markers. Must be at least 2.

    Returns:
        tuple: ``(input_ids, attention_mask)``, two ``torch.long`` tensors of
        shape ``(1, max_query_len)``. The mask is 1 on real tokens and 0 on
        padding.

    Raises:
        ValueError: If ``max_query_len`` is smaller than 2, because the two
            special tokens alone would not fit.
    """
    if max_query_len < 2:
        raise ValueError("max_query_len must be at least 2 to hold [CLS] and [SEP]")

    tokens = tokenizer.tokenize(text_input)
    # Truncate so that the two special tokens still fit in max_query_len.
    tokens = ["[CLS]"] + tokens[:max_query_len - 2] + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(token_ids)

    # Pad to the fixed length (0 is presumably the tokenizer's padding id —
    # TODO confirm against the vocabulary in use).
    padding_length = max_query_len - len(token_ids)
    token_ids += [0] * padding_length
    attention_mask += [0] * padding_length

    # Convert to tensors with a leading batch dimension of 1.
    input_ids = torch.tensor([token_ids], dtype=torch.long)
    attention_mask = torch.tensor([attention_mask], dtype=torch.long)
    return input_ids, attention_mask
def prepare_nested_tensor(image, text_input, tokenizer, max_query_len=64, device='cuda'):
    """Wrap image and text data into a NestedTensor and move it to the target device.

    NOTE(review): the implementation is missing from this paste; only the
    signature, the docstring and the first comment survived. As written the
    function does nothing and returns ``None``.
    """
    # Process the text data
def read_examples(text_input, unique_id=None):
    """Process a custom text input and create an InputExample object.

    If ``unique_id`` is not provided, the hash of the text is meant to be
    used as the unique identifier.

    NOTE(review): the paste is cut off after the first two statements. The
    unique-id fallback, the construction of the example and the return
    statement are not visible, so as written this returns ``None``.
    """
    examples = []
    text_a = text_input
    # If no unique_id was provided, use the hash of the text
# Example usage: pass in a text and let the unique_id be generated automatically
text_input = 'the teacher near the blackboard'
examples = read_examples(text_input)
def main(model_path, image_path, text_input, args):
    """Load the model and run inference.

    NOTE(review): the implementation is missing from this paste; only the
    signature, the docstring and the first comment survived. As written the
    function does nothing and returns ``None``.
    """
    # Load the model
# Script entry point. The double underscores of __name__ / '__main__' were
# missing, which made the guard raise NameError instead of detecting direct execution.
if __name__ == '__main__':
    model_path = '/mnt/ckpt/gref/best_checkpoint.pth'  # replace with the actual model path
    image_path = './frame_120.png'  # replace with the path of the image to run inference on
    text_input = 'a woman with long hair'  # text describing the target in the image
    # NOTE(review): the rest of the entry point (argument parsing and the
    # call to main) is not visible in this paste.