NielsRogge / Transformers-Tutorials

This repository contains demos I made with the Transformers library by HuggingFace.

RunTimeError when creating spatial position embeddings #340

Open · madhavi1102 opened this issue 1 year ago

madhavi1102 commented 1 year ago

@NielsRogge Hi Niels, I am running into the following issue when running the LayoutLMv3 model on the FUNSD dataset:

        return forward_call(*args, **kwargs)
      File "/Users/im/Documents/projects_dl/ocr_layoutlm/myenv1/lib/python3.9/site-packages/transformers-4.32.0.dev0-py3.9.egg/transformers/models/layoutlmv3/modeling_layoutlmv3.py", line 345, in forward
        embeddings = embeddings + spatial_position_embeddings
    RuntimeError: The size of tensor a (512) must match the size of tensor b (4) at non-singleton dimension 2

I have created a custom dataset for the FUNSD annotations and images, as shown below:

    class CustomFunsdDataSet(Dataset):
        def __init__(self, filepath, processor):
            self.words = []
            self.bboxes = []
            self.images = []
            self.word_labels = []

            self.target_labels = ["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"]
            self.id2label = {k: v for k, v in enumerate(self.target_labels)}
            self.label2id = {v: k for k, v in enumerate(self.target_labels)}
            print(self.label2id)

            for _, example in Funsd().generate_examples(filepath):
                self.words.append(example["tokens"])
                self.bboxes.append(example["bboxes"])
                self.images.append(example["image"])
                self.word_labels.append([self.label2id[tag] for tag in example["ner_tags"]])

            self.encoding = processor(images=self.images, text=self.words, boxes=self.bboxes,
                                      word_labels=self.word_labels,
                                      max_length=512,
                                      padding="max_length",
                                      truncation="longest_first",
                                      return_tensors='pt')
            print(f"preprocessor result:\n {type(self.encoding)}, {self.encoding.keys()}")

        def __len__(self):
            return len(self.images)

        def __getitem__(self, index) -> dict:
            return {
                "input_ids": torch.tensor(self.encoding["input_ids"][index], dtype=torch.int64),
                "attention_mask": torch.tensor(self.encoding["attention_mask"][index], dtype=torch.int64),
                "bbox": torch.tensor(self.encoding["bbox"], dtype=torch.int64),
                "pixel_values": torch.tensor(self.encoding['pixel_values'], dtype=torch.float32),
                "labels": torch.tensor(self.encoding['labels'], dtype=torch.int64)
            }
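One thing I noticed while debugging: in `__getitem__`, only `input_ids` and `attention_mask` are sliced with `[index]`; `bbox`, `pixel_values`, and `labels` return the tensors for the entire dataset, which may be where the extra dimension comes from. If that is the cause, a sketch of per-sample indexing might look like this (assuming the processor output is batched along the first axis; since `return_tensors='pt'` already yields tensors, wrapping them in `torch.tensor()` again should not be needed):

    def __getitem__(self, index) -> dict:
        # Slice every field with [index] so each sample has shape (512, ...)
        # rather than (num_documents, 512, ...).
        return {
            "input_ids": self.encoding["input_ids"][index],
            "attention_mask": self.encoding["attention_mask"][index],
            "bbox": self.encoding["bbox"][index],              # (512, 4)
            "pixel_values": self.encoding["pixel_values"][index],
            "labels": self.encoding["labels"][index],
        }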

    train_dataset = CustomFunsdDataSet(os.path.join(path, train), processor)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=2)
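A quick way to see where the extra dimension comes from is to print the shapes of one batch (a debugging sketch using the names above; the expected shapes assume the base checkpoint with 224x224 images):

    batch = next(iter(train_dataloader))
    for key, value in batch.items():
        print(key, tuple(value.shape))

    # Expected for batch_size=2 with the base checkpoint:
    #   input_ids      (2, 512)
    #   attention_mask (2, 512)
    #   bbox           (2, 512, 4)
    #   pixel_values   (2, 3, 224, 224)
    #   labels         (2, 512)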

The processor is defined as:

    feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
    tokenizer = LayoutLMv3TokenizerFast.from_pretrained(
        "/Users/madhavim/Documents/projects_dl/ocr_layoutlm/src_layoutlmv3/model_layoutlmv3",
        ignore_mismatched_sizes=True)

    processor = LayoutLMv3Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
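For reference, the processor can also be loaded in a single call, which keeps the tokenizer and feature extractor consistent (a sketch assuming the public microsoft/layoutlmv3-base checkpoint rather than my local model directory):

    from transformers import LayoutLMv3Processor

    # apply_ocr=False because words and boxes are supplied from the annotations.
    processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)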

Upon training the LayoutLMv3ForTokenClassification model, the runtime size-mismatch error shown at the top of this issue (modeling_layoutlmv3.py, line 345, `embeddings = embeddings + spatial_position_embeddings`) is thrown when the spatial position embeddings are created.

I am not sure whether the above diagnosis is right or how else to resolve this issue. Looking forward to your help. Thanks,