mbzuai-oryx / groundingLMM

[CVPR 2024 🔥] Grounding Large Multimodal Model (GLaMM), the first-of-its-kind model capable of generating natural language responses that are seamlessly integrated with object segmentation masks.
https://grounding-anything.com

Why is the model() forward pass used instead of model.generate() when computing segmentation results? Wouldn't this mean that, when predicting the next token, the model sees the ground-truth token rather than its own prediction? #67

Open L1uShuai opened 1 month ago

L1uShuai commented 1 month ago

When computing the segmentation results, the `model()` function is employed:

```python
def evaluate_model_performance(validation_loader, model, args):
    # Trackers for metrics
    trackers = {
        "intersection": AverageMeter("Intersec", ":6.3f", Summary.SUM),
        "union": AverageMeter("Union", ":6.3f", Summary.SUM),
        "gIoU": AverageMeter("gIoU", ":6.3f", Summary.SUM)
    }

    model.eval()
    for data_batch in tqdm.tqdm(validation_loader):
        # Prepare data and convert relevant tensors to the appropriate type
        data_batch = dict_to_cuda(data_batch)
        for key in ["global_enc_images", "grounding_enc_images"]:
            data_batch[key] = data_batch[key].to(dtype=torch.bfloat16, device=args.local_rank)

        torch.cuda.empty_cache()

        # Model inference without gradient tracking
        with torch.no_grad():
            results = model(**data_batch)

        predictions = results["pred_masks"]
        gt_masks = results["gt_masks"][0].int()
        predicted_masks = (predictions[0] > 0).int()  # Thresholding to get binary masks
        assert len(predictions) == 1

        intersection, union, accuracy_iou = 0.0, 0.0, 0.0
        for target, prediction in zip(gt_masks, predicted_masks):
            intersect, union_, _ = intersectionAndUnionGPU(
                prediction.contiguous().clone(), target.contiguous(), 2, ignore_index=255
            )
            intersection += intersect
            union += union_
            accuracy_iou += intersect / (union_ + 1e-5)
            # handles no-object targets
            accuracy_iou[union_ == 0] += 1.0

        intersection, union = intersection.cpu().numpy(), union.cpu().numpy()
        accuracy_iou = accuracy_iou.cpu().numpy() / gt_masks.shape[0]
        trackers["intersection"].update(intersection)
        trackers["union"].update(union)
        trackers["gIoU"].update(accuracy_iou, n=gt_masks.shape[0])

    for meter in trackers.values():
        meter.all_reduce()

    iou_per_class = trackers["intersection"].sum / (trackers["union"].sum + 1e-10)
    class_iou = iou_per_class[1]
    global_iou = trackers["gIoU"].avg[1]

    return global_iou, class_iou
```
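
To make the concern concrete, here is a minimal sketch with a generic Hugging Face causal LM (GPT-2 is only a stand-in here, not GLaMM): a plain forward pass is teacher-forced, so the logits at each position are computed while attending to the ground-truth prefix, whereas `generate()` conditions each new token only on the tokens the model has already produced.

```python
# Minimal sketch (generic Hugging Face causal LM, *not* GLaMM) contrasting a
# teacher-forced forward pass with autoregressive decoding via generate().
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

full_sequence = tokenizer("Describe the image: a dog lying on the grass", return_tensors="pt")
prompt_only = tokenizer("Describe the image:", return_tensors="pt")

with torch.no_grad():
    # Forward pass (teacher forcing): the prediction for position t+1 is computed
    # while attending to the ground-truth tokens 0..t of the supplied sequence.
    forced_logits = model(**full_sequence).logits
    forced_next_tokens = forced_logits.argmax(dim=-1)

    # generate() (autoregressive): each new token is conditioned on the prompt plus
    # the tokens the model itself has predicted so far.
    generated = model.generate(
        **prompt_only, max_new_tokens=12, do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(forced_next_tokens[0]))
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```

If `model(**data_batch)` in the evaluation above behaves like the first case, the text branch during mask evaluation would be conditioned on ground-truth tokens rather than on the model's own predictions, which is exactly what I am asking about.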

When computing the caption results, `model.generate()` is utilized:

```python
def inference(instructions, inputs):
    # Extract the inputs
    bbox_img = inputs['boxes']
    image_path = inputs['image']

    instructions = instructions.replace('&lt;', '<').replace('&gt;', '>')

    # Prepare prompt for model inference
    conv = conversation_lib.conv_templates[args.conv_type].copy()
    conv.messages = []
    begin_str = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n"""
    prompt = begin_str + instructions
    if args.use_mm_start_end:
        replace_token = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN)
        prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], "")
    prompt = conv.get_prompt()

    # Read and preprocess the image (Global image encoder - CLIP)
    image_np = cv2.imread(image_path)
    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
    original_size_list = [image_np.shape[:2]]
    image_clip = (clip_image_processor.preprocess(image_np, return_tensors="pt")["pixel_values"][0].unsqueeze(0).cuda())
    image_clip = image_clip.bfloat16()  # Precision is bf16 by default

    # Preprocess the image (Grounding image encoder)
    image = transform.apply_image(image_np)
    resize_list = [image.shape[:2]]
    image = (
        grounding_image_ecoder_preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous()).unsqueeze(0).cuda())
    image = image.bfloat16()  # Precision is bf16 by default

    # Prepare inputs for inference
    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    input_ids = input_ids.unsqueeze(0).cuda()
    bboxes = None
    if len(bbox_img) > 0:
        height, width = original_size_list[0]  # Original image dimensions

        # Rescaling BBox to 336*336
        x_scale, y_scale = 336 / width, 336 / height
        bboxes_scaled = [[bbox[0] * x_scale, bbox[1] * y_scale,
                          bbox[2] * x_scale, bbox[3] * y_scale] for bbox in bbox_img]
        ori_bboxes = np.array(bboxes_scaled, dtype=np.float64)
        height_sc, width_sc = (336, 336)  # To normalize the boxes
        norm_bboxes = ori_bboxes / np.array([width_sc, height_sc, width_sc, height_sc])
        bboxes = [torch.tensor(norm_bboxes).cuda().half().to(torch.bfloat16)]

    # Generate output
    output_ids, pred_masks = model.evaluate(image_clip, image, input_ids, resize_list, original_size_list,
                                            max_tokens_new=512, bboxes=bboxes)
    output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]

    # Post-processing
    text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
    text_output = text_output.replace("\n", "").replace("  ", " ")
    text_output = text_output.split("ASSISTANT: ")[-1]

    cleaned_str = re.sub(r'<.*?>', '', text_output)

    # Remove the [SEG] token
    cleaned_str = cleaned_str.replace('[SEG]', '')

    # Strip unnecessary spaces
    cleaned_str = ' '.join(cleaned_str.split()).strip("'")
    cleaned_str = cleaned_str.strip()

    return cleaned_str
```
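
For reference, a standalone illustration of the cleanup at the end of `inference`, run on a made-up output string (the exact `<p> ... </p> [SEG]` pattern is only my assumption of what the decoded text looks like):

```python
# Standalone illustration of the post-processing steps above; the input string is
# made up for demonstration and only mimics a <p> ... </p> [SEG] style output.
import re

text_output = "ASSISTANT: <p> A dog </p> [SEG] lying on <p> the grass </p> [SEG]"
text_output = text_output.split("ASSISTANT: ")[-1]

cleaned_str = re.sub(r'<.*?>', '', text_output)   # drop <p>, </p> and other tags
cleaned_str = cleaned_str.replace('[SEG]', '')    # drop the [SEG] tokens
cleaned_str = ' '.join(cleaned_str.split()).strip("'").strip()

print(cleaned_str)  # A dog lying on the grass
```
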
yruns commented 1 week ago

same question