IDEA-Research / GroundingDINO

[ECCV 2024] Official implementation of the paper "Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection"
https://arxiv.org/abs/2303.05499
Apache License 2.0
5.72k stars 603 forks source link

RuntimeError: Unsupported TypeMeta in ATen: (please report this error) #326

Open kshitizagarwal-dev opened 2 months ago

kshitizagarwal-dev commented 2 months ago

I am loading the image and resizing it to 512 x 512 using the code:

Load image

from PIL import Image

Open the image file

def load_images(path,ext): image = Image.open(path)

subfolders = ["pngImages", "jpgImages"]
main_folder = "/content/Grounded-Segment-Anything/images/"

os.makedirs(main_folder)
os.makedirs(main_folder+subfolders[0])
os.makedirs(main_folder+subfolders[1])
resized_image = image.resize((512, 512))
if ext == 'jpg' :
  resized_image.save("/content/Grounded-Segment-Anything/images/jpgImages/resized_image.jpg")  # Save the resized image to a file
  local_image_path = "/content/Grounded-Segment-Anything/images/jpgImages/resized_image.jpg"
else:
  resized_image.save("/content/Grounded-Segment-Anything/images/pngImages/resized_image.png")  # Save the resized image to a file
  local_image_path = "/content/Grounded-Segment-Anything/images/pngImages/resized_image.png"

image_source, image = load_image(local_image_path)
return image_source, image

Image.fromarray(image_source)

image_source, image = load_images('/content/drive/MyDrive/rust/12.jpg','jpg')

/content/drive/MyDrive/rust/Heavy rust 5.png

Image.fromarray(image_source)

def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'): cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)

args = SLConfig.fromfile(cache_config_file)
args.device = device
model = build_model(args)

cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
checkpoint = torch.load(cache_file, map_location=device)
log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
print("Model loaded from {} \n => {}".format(cache_file, log))
_ = model.eval()
import os
print(os.getcwd())
return model

ckpt_repo_id = "ShilongLiu/GroundingDINO" ckpt_filenmae = "groundingdino_swinb_cogcoor.pth" ckpt_config_filename = "GroundingDINO_SwinB.cfg.py"

groundingdino_model = load_model_hf(ckpt_repo_id, ckpt_filenmae, ckpt_config_filename, device)

Detect objects using Grounding DINO

def detect(image, text_prompt, model, box_threshold = 0.3, text_threshold = 0.25):

boxes, logits, phrases = predict( model=model, image=image, caption=text_prompt, box_threshold=box_threshold, text_threshold=text_threshold )

print(phrases) annotated_frame,xyxy = annotate1(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases) annotated_frame = annotated_frame[...,::-1] # BGR to RGB return annotated_frame, boxes, logits,xyxy

annotated_frame, detected_boxes, detected_logits,xyxy= detect(image, text_prompt="rust of all type", model=groundingdino_model) Image.fromarray(annotated_frame)

Error trace:

FutureWarning: The device argument is deprecated and will be removed in v5 of Transformers. UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. UserWarning: None of the inputs have requires_grad=True. Gradients will be None

RuntimeError Traceback (most recent call last) in <cell line: 1>() ----> 1 annotated_frame, detected_boxes, detected_logits,xyxy= detect(image, text_prompt="rust of all type", model=groundingdino_model) 2 Image.fromarray(annotated_frame)

24 frames in detect(image, text_prompt, model, box_threshold, text_threshold) 2 def detect(image, text_prompt, model, box_threshold = 0.3, text_threshold = 0.25): 3 ----> 4 boxes, logits, phrases = predict( 5 model=model, 6 image=image,

/content/Grounded-Segment-Anything/GroundingDINO/groundingdino/util/inference.py in predict(model, image, caption, box_threshold, text_threshold, device) 65 66 with torch.no_grad(): ---> 67 outputs = model(image[None], captions=[caption]) 68 69 prediction_logits = outputs["pred_logits"].cpu().sigmoid()[0] # prediction_logits.shape = (nq, 256)

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, kwargs) 1509 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(args, kwargs) 1512 1513 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, *kwargs) 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(args, **kwargs) 1521 1522 try:

/content/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/groundingdino.py in forward(self, samples, targets, **kw) 311 312 input_query_bbox = input_query_label = attn_mask = dn_meta = None --> 313 hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer( 314 srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict 315 )

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, kwargs) 1509 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(args, kwargs) 1512 1513 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, *kwargs) 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(args, **kwargs) 1521 1522 try:

/content/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/transformer.py in forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask, text_dict) 256 # Begin Encoder 257 ######################################################### --> 258 memory, memory_text = self.encoder( 259 src_flatten, 260 pos=lvl_pos_embed_flatten,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, kwargs) 1509 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(args, kwargs) 1512 1513 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, *kwargs) 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(args, **kwargs) 1521 1522 try:

/content/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/transformer.py in forward(self, src, pos, spatial_shapes, level_start_index, valid_ratios, key_padding_mask, memory_text, text_attention_mask, pos_text, text_self_attention_masks, position_ids) 575 # main process 576 if self.use_transformer_ckpt: --> 577 output = checkpoint.checkpoint( 578 layer, 579 output,

/usr/local/lib/python3.10/dist-packages/torch/_compile.py in inner(*args, *kwargs) 22 import torch._dynamo 23 ---> 24 return torch._dynamo.disable(fn, recursive)(args, **kwargs) 25 26 return inner

/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py in _fn(*args, *kwargs) 487 dynamo_config_ctx.enter() 488 try: --> 489 return fn(args, **kwargs) 490 finally: 491 set_eval_frame(prior)

/usr/local/lib/python3.10/dist-packages/torch/_dynamo/external_utils.py in inner(*args, kwargs) 15 @functools.wraps(fn) 16 def inner(*args, *kwargs): ---> 17 return fn(args, kwargs) 18 19 return inner

/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py in checkpoint(function, use_reentrant, context_fn, determinism_check, debug, *args, *kwargs) 480 "use_reentrant=False." 481 ) --> 482 return CheckpointFunction.apply(function, preserve, args) 483 else: 484 gen = _checkpoint_without_reentrant_generator(

/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py in apply(cls, *args, *kwargs) 551 # See NOTE: [functorch vjp and autograd interaction] 552 args = _functorch.utils.unwrap_dead_wrappers(args) --> 553 return super().apply(args, **kwargs) # type: ignore[misc] 554 555 if not is_setup_ctx_defined:

/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py in forward(ctx, run_function, preserve_rng_state, args) 259 260 with torch.no_grad(): --> 261 outputs = run_function(args) 262 return outputs 263

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, kwargs) 1509 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(args, kwargs) 1512 1513 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, *kwargs) 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(args, **kwargs) 1521 1522 try:

/content/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/transformer.py in forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask) 784 # self attention 785 # import ipdb; ipdb.set_trace() --> 786 src2 = self.self_attn( 787 query=self.with_pos_embed(src, pos), 788 reference_points=reference_points,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, kwargs) 1509 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(args, kwargs) 1512 1513 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, *kwargs) 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(args, **kwargs) 1521 1522 try:

/content/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/ms_deform_attn.py in forward(self, query, key, value, query_pos, key_padding_mask, reference_points, spatial_shapes, level_start_index, **kwargs) 336 attention_weights = attention_weights.float() 337 --> 338 output = MultiScaleDeformableAttnFunction.apply( 339 value, 340 spatial_shapes,

/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py in apply(cls, *args, *kwargs) 551 # See NOTE: [functorch vjp and autograd interaction] 552 args = _functorch.utils.unwrap_dead_wrappers(args) --> 553 return super().apply(args, **kwargs) # type: ignore[misc] 554 555 if not is_setup_ctx_defined:

/content/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/ms_deform_attn.py in forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step) 51 ): 52 ctx.im2col_step = im2col_step ---> 53 output = _C.ms_deform_attn_forward( 54 value, 55 value_spatial_shapes,

RuntimeError: Unsupported TypeMeta in ATen: (please report this error)

SubicLovePython commented 2 months ago

me too

ziyuehcen0 commented 1 month ago

Same here.

bingwork commented 3 weeks ago

me too