Closed ta4tsering closed 1 year ago
can you give an example of a box in the json file that has a negative width or height? that's very strange... my initial reaction is to think that this might be a side effect of the space insertion but I might be wrong
this input gives a -1 width for the last box in bboxes: 06150003.json.gz
This bug arises in the get_avg_bbox_height(self, bboxes) method of the OCRFormatter class (line 123 of ocr.py), which fails with a division-by-zero error.
can you copy-paste the json of the box as a comment in this PR please?
{
"textAnnotations": [
{
"locale": "und",
"description": "\u0f53\n\u0f0b\n\u0f42 \u0f42",
"boundingPoly": {
"vertices": [
{
"x": 816,
"y": 388
},
{
"x": 3635,
"y": 388
},
{
"x": 3635,
"y": 672
},
{
"x": 816,
"y": 672
}
],
"normalizedVertices": []
},
"mid": "",
"score": 0.0,
"confidence": 0.0,
"topicality": 0.0,
"locations": [],
"properties": []
},
{
"description": "\u0f53",
"boundingPoly": {
"vertices": [
{
"x": 880,
"y": 388
},
{
"x": 880,
"y": 411
},
{
"x": 817,
"y": 411
},
{
"x": 817,
"y": 388
}
],
"normalizedVertices": []
},
"mid": "",
"locale": "",
"score": 0.0,
"confidence": 0.0,
"topicality": 0.0,
"locations": [],
"properties": []
},
{
"description": "\u0f0b",
"boundingPoly": {
"vertices": [
{
"x": 3636,
"y": 520
},
{
"x": 3636,
"y": 529
},
{
"x": 3609,
"y": 529
},
{
"x": 3609,
"y": 520
}
],
"normalizedVertices": []
},
"mid": "",
"locale": "",
"score": 0.0,
"confidence": 0.0,
"topicality": 0.0,
"locations": [],
"properties": []
},
{
"description": "\u0f42",
"boundingPoly": {
"vertices": [
{
"x": 884,
"y": 574
},
{
"x": 884,
"y": 604
},
{
"x": 832,
"y": 604
},
{
"x": 832,
"y": 574
}
],
"normalizedVertices": []
},
"mid": "",
"locale": "",
"score": 0.0,
"confidence": 0.0,
"topicality": 0.0,
"locations": [],
"properties": []
},
{
"description": "\u0f42",
"boundingPoly": {
"vertices": [
{
"x": 884,
"y": 611
},
{
"x": 884,
"y": 672
},
{
"x": 832,
"y": 672
},
{
"x": 832,
"y": 611
}
],
"normalizedVertices": []
},
"mid": "",
"locale": "",
"score": 0.0,
"confidence": 0.0,
"topicality": 0.0,
"locations": [],
"properties": []
}
],
"fullTextAnnotation": {
"pages": [
{
"width": 4344,
"height": 1046,
"blocks": [
{
"boundingBox": {
"vertices": [
{
"x": 880,
"y": 388
},
{
"x": 880,
"y": 411
},
{
"x": 817,
"y": 411
},
{
"x": 817,
"y": 388
}
],
"normalizedVertices": []
},
"paragraphs": [
{
"boundingBox": {
"vertices": [
{
"x": 880,
"y": 388
},
{
"x": 880,
"y": 411
},
{
"x": 817,
"y": 411
},
{
"x": 817,
"y": 388
}
],
"normalizedVertices": []
},
"words": [
{
"boundingBox": {
"vertices": [
{
"x": 880,
"y": 388
},
{
"x": 880,
"y": 411
},
{
"x": 817,
"y": 411
},
{
"x": 817,
"y": 388
}
],
"normalizedVertices": []
},
"symbols": [
{
"property": {
"detectedBreak": {
"type": 5,
"isPrefix": false
},
"detectedLanguages": []
},
"boundingBox": {
"vertices": [
{
"x": 880,
"y": 388
},
{
"x": 880,
"y": 411
},
{
"x": 817,
"y": 411
},
{
"x": 817,
"y": 388
}
],
"normalizedVertices": []
},
"text": "\u0f53",
"confidence": 0.7286599
}
],
"confidence": 0.7286599
}
],
"confidence": 0.7286599
}
],
"blockType": 1,
"confidence": 0.7286599
},
{
"boundingBox": {
"vertices": [
{
"x": 3636,
"y": 520
},
{
"x": 3636,
"y": 529
},
{
"x": 3609,
"y": 529
},
{
"x": 3609,
"y": 520
}
],
"normalizedVertices": []
},
"paragraphs": [
{
"boundingBox": {
"vertices": [
{
"x": 3636,
"y": 520
},
{
"x": 3636,
"y": 529
},
{
"x": 3609,
"y": 529
},
{
"x": 3609,
"y": 520
}
],
"normalizedVertices": []
},
"words": [
{
"boundingBox": {
"vertices": [
{
"x": 3636,
"y": 520
},
{
"x": 3636,
"y": 529
},
{
"x": 3609,
"y": 529
},
{
"x": 3609,
"y": 520
}
],
"normalizedVertices": []
},
"symbols": [
{
"property": {
"detectedBreak": {
"type": 5,
"isPrefix": false
},
"detectedLanguages": []
},
"boundingBox": {
"vertices": [
{
"x": 3636,
"y": 520
},
{
"x": 3636,
"y": 529
},
{
"x": 3609,
"y": 529
},
{
"x": 3609,
"y": 520
}
],
"normalizedVertices": []
},
"text": "\u0f0b",
"confidence": 0.6573698
}
],
"confidence": 0.6573698
}
],
"confidence": 0.6573698
}
],
"blockType": 1,
"confidence": 0.6573698
},
{
"boundingBox": {
"vertices": [
{
"x": 884,
"y": 574
},
{
"x": 885,
"y": 672
},
{
"x": 833,
"y": 672
},
{
"x": 832,
"y": 574
}
],
"normalizedVertices": []
},
"paragraphs": [
{
"boundingBox": {
"vertices": [
{
"x": 884,
"y": 574
},
{
"x": 885,
"y": 672
},
{
"x": 833,
"y": 672
},
{
"x": 832,
"y": 574
}
],
"normalizedVertices": []
},
"words": [
{
"boundingBox": {
"vertices": [
{
"x": 884,
"y": 574
},
{
"x": 884,
"y": 604
},
{
"x": 832,
"y": 604
},
{
"x": 832,
"y": 574
}
],
"normalizedVertices": []
},
"symbols": [
{
"property": {
"detectedBreak": {
"type": 1,
"isPrefix": false
},
"detectedLanguages": []
},
"boundingBox": {
"vertices": [
{
"x": 884,
"y": 574
},
{
"x": 884,
"y": 604
},
{
"x": 832,
"y": 604
},
{
"x": 832,
"y": 574
}
],
"normalizedVertices": []
},
"text": "\u0f42",
"confidence": 0.99997324
}
],
"confidence": 0.99997324
},
{
"boundingBox": {
"vertices": [
{
"x": 884,
"y": 611
},
{
"x": 884,
"y": 672
},
{
"x": 832,
"y": 672
},
{
"x": 832,
"y": 611
}
],
"normalizedVertices": []
},
"symbols": [
{
"property": {
"detectedBreak": {
"type": 5,
"isPrefix": false
},
"detectedLanguages": []
},
"boundingBox": {
"vertices": [
{
"x": 884,
"y": 611
},
{
"x": 884,
"y": 672
},
{
"x": 832,
"y": 672
},
{
"x": 832,
"y": 611
}
],
"normalizedVertices": []
},
"text": "\u0f42",
"confidence": 0.99940836
}
],
"confidence": 0.99940836
}
],
"confidence": 0.9996908
}
],
"blockType": 1,
"confidence": 0.9996908
}
],
"confidence": 0.79524016
}
],
"text": "\u0f53\n\u0f0b\n\u0f42 \u0f42"
},
"faceAnnotations": [],
"landmarkAnnotations": [],
"logoAnnotations": [],
"labelAnnotations": [],
"localizedObjectAnnotations": []
}
Thanks! Can you tell me which one of these coordinates is supposed to have a height of 0?
I will recreate the bug and send you the screen shot of it.
I don't doubt that there's a division by 0 error, but I think the change you made is just a small patch on what might be a larger issue. We should really understand where the issue comes from: none of the boxes has a height of 0, so there is no reason why there would be a division by 0.
Looking at the data again, I think this might come from the fact that the coordinates in a bbox are not always in the same order. For instance, at the beginning you have:
"vertices": [
{
"x": 816,
"y": 388
},
{
"x": 3635,
"y": 388
},
{
"x": 3635,
"y": 672
},
{
"x": 816,
"y": 672
}
],
so the order is
43
12
but then you have the more usual pattern:
"vertices": [
{
"x": 884,
"y": 611
},
{
"x": 884,
"y": 672
},
{
"x": 832,
"y": 672
},
{
"x": 832,
"y": 611
}
],
which is
32
41
so I think the issue is in these lines of code, which assume the second structure and get the wrong coordinates for the first type (giving a height of 0)
Well... actually @ngawangtrinley that bug might be a great discovery... I think the "normal" boxes are
32
41
and the boxes that are
43
12
are those that are rotated 90°... so this might be a way to detect marginal content and remove it!! @ta4tsering can you detect the rotation of the box and add it to a field in the `bbox` class?
Well... actually @ngawangtrinley that bug might be a great discovery... I think the "normal" boxes are
32 41
and the boxes that are
43 12
are those that are rotated 90°... so this might be a way to detect marginal content and remove it!! @ta4tsering can you detect the rotation of the box and add it to a field in the `bbox` class?
You mean following by box orientation? I'm pretty sure we tested that and it doesn't work all the time
oh ok. Do you remember cases where it didn't work?
oh ok. Do you remember cases where it didn't work?
We included the orientation in the equation on pages 13-16. I remember going through quite a few samples with @kaldan007. If I remember correctly, a common issue is that the margin text is interpreted as being part of the horizontal text and is read as portrait boxes, while it should be a landscape box from vertical text. Maybe @kaldan007 has more details on the issue.
After a discussion with @ngawangtrinley it turns out that this box orientation wasn't discovered before, so we should run some tests on it. `BBox` should have an `angle` integer value between `0` (the default) and `359` (so no negative values), and it should be filled with the value detected from the order of the coordinates. @ta4tsering do you see what needs to be done?
@eroux let's take the coordinates of the diagonal vertices. That will solve the issue; we don't need to bother about the orientation at all.
@kaldan007 I don't understand your comment, what do you mean?
this pr is replaced by #214
Input to recreate the bug: 06150003.json.gz 06190003.json.gz