mbzuai-oryx / groundingLMM

[CVPR 2024 🔥] Grounding Large Multimodal Model (GLaMM), the first-of-its-kind model capable of generating natural language responses that are seamlessly integrated with object segmentation masks.
https://grounding-anything.com

Confusing referring segmentation results. #63

Open ZhimaoPeng opened 2 months ago

ZhimaoPeng commented 2 months ago

Hi,

I tried to use the modified app.py to work with my own data, but got strange segmentation results:

[attached images: predicted mask, input image n13052670_94]

I don't know what went wrong and I hope to get your help. Here is the modified code:

def inference(input_str, all_inputs, follow_up, generate):
    # all_inputs is a plain image path here rather than the Gradio dict the
    # original app.py expects, so there are no drawn boxes to read.
    # bbox_img = all_inputs['boxes']
    bbox_img = []

    # input_image = all_inputs['image']
    input_image = all_inputs

print("input_str: ", input_str, "input_image: ", input_image)

if generate:
    return generate_new_image(st_pipe, input_str, input_image)

if not follow_up:
    conv = conversation_lib.conv_templates[args.conv_type].copy()
    conv.messages = []
    conv_history = {'user': [], 'model': []}
    conv_history["user"].append(input_str)

# input_str = input_str.replace('&lt;', '<').replace('&gt;', '>')
prompt = input_str
prompt = f"The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture." + "\n" + prompt
if args.use_mm_start_end:
    replace_token = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN)
    prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

if not follow_up:
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], "")
else:
    conv.append_message(conv.roles[0], input_str)
    conv.append_message(conv.roles[1], "")
prompt = conv.get_prompt()

image_np = cv2.imread(input_image)
image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
orig_h, orig_w = image_np.shape[:2]
original_size_list = [image_np.shape[:2]]

    # Prepare input for Global Image Encoder
    global_enc_image = global_enc_processor.preprocess(
        image_np, return_tensors="pt")["pixel_values"][0].unsqueeze(0).cuda()
    global_enc_image = global_enc_image.bfloat16()

    # Prepare input for Grounding Image Encoder
    image = transform.apply_image(image_np)
    resize_list = [image.shape[:2]]
    grounding_enc_image = (grounding_enc_processor(torch.from_numpy(image).permute(2, 0, 1).
                                                   contiguous()).unsqueeze(0).cuda())
    grounding_enc_image = grounding_enc_image.bfloat16()

    # Prepare input for Region Image Encoder
    post_h, post_w = global_enc_image.shape[1:3]
    bboxes = None
    # if len(bbox_img) > 0:
    #     bboxes = region_enc_processor((orig_h, orig_w), (post_h, post_w), bbox_img)

    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    input_ids = input_ids.unsqueeze(0).cuda()

    # Pass prepared inputs to model
    output_ids, pred_masks = model.evaluate(
        global_enc_image, grounding_enc_image, input_ids, resize_list, original_size_list,
        max_tokens_new=512, bboxes=bboxes)
    output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]

    text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
    text_output = text_output.replace("\n", "").replace("  ", " ")
    text_output = text_output.split("ASSISTANT: ")[-1]
    print("text_output: ", text_output)

    # For multi-turn conversation
    conv.messages.pop()
    conv.append_message(conv.roles[1], text_output)
    conv_history["model"].append(text_output)
    color_history = []
    save_img = None
    if "[SEG]" in text_output:
        save_img = prepare_mask(input_image, image_np, pred_masks, text_output, color_history)

    output_str = text_output  # input_str
    if save_img is not None:
        output_image = save_img  # input_image
    else:
        if len(bbox_img) > 0:
            output_image = draw_bbox(image_np.copy(), bbox_img)
        else:
            output_image = input_image

    markdown_out = process_markdown(output_str, color_history)

    return output_image, markdown_out

if name == "main": args = parse_args(sys.argv[1:]) tokenizer = setup_tokenizer_and_special_tokens(args) model = initialize_model(args, tokenizer) model = prepare_model_for_inference(model, args) global_enc_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower) transform = ResizeLongestSide(args.image_size) model.eval()

    # st_pipe = AutoPipelineForInpainting.from_pretrained(
    #     "diffusers/stable-diffusion-xl-1.0-inpainting-0.1", torch_dtype=torch.float16, variant="fp16"
    # ).to("cuda")

    conv = None
    # Only to display output
    conv_history = {'user': [], 'model': []}
    mask_path = None
    input_str = 'Please segment mushromm in this image.'
    all_inputs = 'n13052670_94.JPEG'
    generate = False
    follow_up = False
    output_image, markdown_out = inference(input_str, all_inputs, follow_up, generate)
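
One way to narrow down results like this is to inspect the raw pred_masks that model.evaluate returns, before prepare_mask draws them. Below is a minimal debugging sketch, not part of the repo: it assumes (as in LISA-style decoders) that pred_masks is a list of float tensors of shape [num_masks, H, W] at the original image resolution, binarized at 0; the helper name debug_pred_masks and the output file prefix are made up for illustration.

import cv2
import numpy as np

def debug_pred_masks(pred_masks, image_np, out_prefix="debug_mask"):
    # Print per-mask statistics and save a crude red overlay so degenerate
    # predictions (all-empty, full-image, or NaN masks) are easy to spot.
    for i, masks in enumerate(pred_masks):
        masks = masks.detach().cpu().float().numpy()
        for j in range(masks.shape[0]):
            m = masks[j]
            binary = m > 0  # assumed thresholding convention of prepare_mask
            print(f"mask[{i}][{j}]: shape={m.shape}, min={m.min():.3f}, "
                  f"max={m.max():.3f}, positive fraction={binary.mean():.3%}")
            overlay = image_np.copy()
            overlay[binary] = (0.5 * overlay[binary] + 0.5 * np.array([255, 0, 0])).astype(np.uint8)
            cv2.imwrite(f"{out_prefix}_{i}_{j}.png", cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))

# Example call, placed right after model.evaluate in inference():
# debug_pred_masks(pred_masks, image_np)

If every mask comes back with near-zero or near-total coverage, the problem is more likely in the inputs (color order, resizing, or a prompt the model never grounded with [SEG]) than in the drawing code.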
sipie800 commented 2 months ago

Same here. It seems to be a toy model. You can use Florence-2 or PaliGemma instead; they are much better with similar functionality. Or go with Grounding DINO + SAM; they are older but can still handle this kind of task.
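
For anyone who wants to try the Grounding DINO + SAM route mentioned above, here is a minimal sketch using the Hugging Face transformers implementations. The checkpoint names (IDEA-Research/grounding-dino-tiny, facebook/sam-vit-base), the thresholds, and the mushroom prompt are assumptions for illustration, not anything shipped with this repo.

import torch
from PIL import Image
from transformers import (AutoProcessor, GroundingDinoForObjectDetection,
                          SamModel, SamProcessor)

image = Image.open("n13052670_94.JPEG").convert("RGB")

# 1) Grounding DINO: text -> boxes (prompt should be lower case and end with a period).
dino_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
dino = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny")
inputs = dino_proc(images=image, text="a mushroom.", return_tensors="pt")
with torch.no_grad():
    outputs = dino(**inputs)
results = dino_proc.post_process_grounded_object_detection(
    outputs, inputs.input_ids, box_threshold=0.35, text_threshold=0.25,
    target_sizes=[image.size[::-1]])[0]
if len(results["boxes"]) == 0:
    raise SystemExit("Grounding DINO found no boxes for this prompt.")

# 2) SAM: boxes -> masks.
sam_proc = SamProcessor.from_pretrained("facebook/sam-vit-base")
sam = SamModel.from_pretrained("facebook/sam-vit-base")
boxes = [[box.tolist() for box in results["boxes"]]]  # one list of xyxy boxes per image
sam_inputs = sam_proc(image, input_boxes=boxes, return_tensors="pt")
with torch.no_grad():
    sam_out = sam(**sam_inputs)
masks = sam_proc.image_processor.post_process_masks(
    sam_out.pred_masks.cpu(), sam_inputs["original_sizes"].cpu(),
    sam_inputs["reshaped_input_sizes"].cpu())
print(results["labels"], results["scores"], masks[0].shape)

Separating the grounding step from the mask step like this also makes it easier to see which stage is producing the confusing output.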