VikParuchuri / marker

Convert PDF to markdown quickly with high accuracy
https://www.datalab.to
GNU General Public License v3.0
14.15k stars 720 forks source link

KeyError during run_ocr using surya #163

Closed Aldo-Aditiya closed 1 month ago

Aldo-Aditiya commented 1 month ago

Running the convert_single_pdf function, I got the error below:

File ...lib/python3.9/site-packages/marker/convert.py:90, in convert_single_pdf(fname, model_lst, max_pages, start_page, metadata, langs, batch_multiplier)
     87 flush_cuda_memory()
     89 # OCR pages as needed
---> 90 pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier)
     91 flush_cuda_memory()
     93 out_meta["ocr_stats"] = ocr_stats

File ...lib/python3.9/site-packages/marker/ocr/recognition.py:51, in run_ocr(doc, pages, langs, rec_model, batch_multiplier)
     49     return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
     50 elif ocr_method == "surya":
---> 51     new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
     52 elif ocr_method == "ocrmypdf":
     53     new_pages = tesseract_recognition(doc, ocr_idxs, langs)

File ...lib/python3.9/site-packages/marker/ocr/recognition.py:76, in surya_recognition(doc, page_idxs, langs, rec_model, pages, batch_multiplier)
     73 detection_results = [p.text_lines.bboxes for p in selected_pages]
     74 polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]
---> 76 results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))
     78 new_pages = []
     79 for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages):

File ...lib/python3.9/site-packages/surya/ocr.py:30, in run_recognition(images, langs, rec_model, rec_processor, bboxes, polygons, batch_size)
     27     all_slices.extend(slices)
     28     all_langs.extend([lang] * len(slices))
---> 30 rec_predictions, _ = batch_recognition(all_slices, all_langs, rec_model, rec_processor, batch_size=batch_size)
     32 predictions_by_image = []
     33 slice_start = 0

File ...lib/python3.9/site-packages/surya/recognition.py:138, in batch_recognition(images, languages, model, processor, batch_size)
    136 while token_count < settings.RECOGNITION_MAX_TOKENS:
    137     is_prefill = token_count == 0
--> 138     return_dict = model(
    139         decoder_input_ids=batch_decoder_input,
    140         decoder_attention_mask=attention_mask,
    141         decoder_self_kv_cache=None if is_prefill else decoder_cache,
    142         decoder_cross_kv_cache=None if is_prefill else encoder_cache,
    143         decoder_past_token_count=token_count,
    144         decoder_langs=batch_langs,
    145         pixel_values=batch_pixel_values,
    146         encoder_outputs=encoder_outputs,
    147         return_dict=True,
    148     )
    150     logits = return_dict["logits"][:current_batch_size] # Ignore batch padding
    151     preds = torch.argmax(logits[:, -1], dim=-1)

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File ...lib/python3.9/site-packages/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py:615, in VisionEncoderDecoderModel.forward(self, pixel_values, decoder_input_ids, decoder_attention_mask, encoder_outputs, past_key_values, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **kwargs)
    610     decoder_input_ids = shift_tokens_right(
    611         labels, self.config.pad_token_id, self.config.decoder_start_token_id
    612     )
    614 # Decode
--> 615 decoder_outputs = self.decoder(
    616     input_ids=decoder_input_ids,
    617     attention_mask=decoder_attention_mask,
    618     encoder_hidden_states=encoder_hidden_states,
    619     encoder_attention_mask=encoder_attention_mask,
    620     inputs_embeds=decoder_inputs_embeds,
    621     output_attentions=output_attentions,
    622     output_hidden_states=output_hidden_states,
    623     use_cache=use_cache,
    624     past_key_values=past_key_values,
    625     return_dict=return_dict,
    626     **kwargs_decoder,
    627 )
    629 # Compute loss independent from decoder (as some shift the logits inside them)
    630 loss = None

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File ...lib/python3.9/site-packages/surya/model/recognition/decoder.py:474, in MBartMoE.forward(self, input_ids, attention_mask, self_kv_cache, cross_kv_cache, past_token_count, langs, encoder_hidden_states, encoder_attention_mask, head_mask, cross_attn_head_mask, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
    471 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    473 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
--> 474 outputs = self.model.decoder(
    475     input_ids=input_ids,
    476     attention_mask=attention_mask,
    477     self_kv_cache=self_kv_cache,
    478     cross_kv_cache=cross_kv_cache,
    479     past_token_count=past_token_count,
    480     langs=langs,
    481     encoder_hidden_states=encoder_hidden_states,
    482 )
    484 logits = self.lm_head(outputs[0])
    486 if not return_dict:

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File ...lib/python3.9/site-packages/surya/model/recognition/decoder.py:387, in MBartMoEDecoder.forward(self, input_ids, attention_mask, self_kv_cache, cross_kv_cache, past_token_count, langs, encoder_hidden_states)
    385 layer_self_kv_cache = self_kv_cache[idx] if self_kv_cache is not None else None
    386 layer_cross_kv_cache = cross_kv_cache[idx] if cross_kv_cache is not None else None
--> 387 layer_outputs = decoder_layer(
    388     hidden_states,
    389     attention_mask=attention_mask,
    390     langs=langs,
    391     self_kv_cache=layer_self_kv_cache,
    392     cross_kv_cache=layer_cross_kv_cache,
    393     is_prefill=is_prefill,
    394     encoder_hidden_states=encoder_hidden_states,
    395     encoder_attention_mask=None,
    396     use_cache=use_cache,
    397 )
    398 hidden_states = layer_outputs[0]
    400 if use_cache:

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File ...lib/python3.9/site-packages/surya/model/recognition/decoder.py:308, in MBartMoEDecoderLayer.forward(self, hidden_states, attention_mask, langs, self_kv_cache, cross_kv_cache, is_prefill, encoder_hidden_states, encoder_attention_mask, use_cache)
    306 hidden_states = self.final_layer_norm(hidden_states)
    307 if self.has_moe:
--> 308     hidden_states = self.moe(hidden_states, langs)
    309 else:
    310     hidden_states = self.activation_fn(self.fc1(hidden_states))

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File ...lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File ...lib/python3.9/site-packages/surya/model/recognition/decoder.py:99, in MBartExpertLayer.forward(self, hidden_states, langs)
     96 if idx.shape[0] == 0:
     97     continue
---> 99 expert_layer = self.experts[str(expert_lang.item())]
    101 current_state = hidden_states[idx]
    102 current_hidden_states = expert_layer(current_state.view(-1, hidden_dim))

File ...lib/python3.9/site-packages/torch/nn/modules/container.py:461, in ModuleDict.__getitem__(self, key)
    459 @_copy_to_script_wrapper
    460 def __getitem__(self, key: str) -> Module:
--> 461     return self._modules[key]

KeyError: '65555'
Aldo-Aditiya commented 1 month ago

Closing this and moving the issue to the surya repo: https://github.com/VikParuchuri/surya/issues/128