This repository contains the code and implementation details of the CascadeTabNet paper "CascadeTabNet: An approach for end to end table detection and structure recognition from image-based documents"
jsonString = json.dumps(attrDict, indent = 4, sort_keys=True)
with open(r'D:\banmian\data\ICDAR2019_cTDaR\ICDAR2019_cTDaR\test\TRACKB2\coco.json', "w",encoding='UTF-8') as f:
f.write(jsonString)
Path to the txt file (see at the top of this script)
trainFile = r"D:\banmian\data\ICDAR2019_cTDaR\ICDAR2019_cTDaR\test\TRACKB2\dataset.txt"
trainXMLFiles = list()
with open(trainFile, "r") as f:
for line in f:
fileName = line.strip()
print(fileName)
trainXMLFiles.append(fileName + ".xml")
Script for Converting Pascal VOC annotations to Coco Json format
This script shows and example conversion of our table dataset pascal voc
annotations conversion to coco annotations
Usage :
You need to first create a txt file containing names of all pascal voc files
You can use following linux command
ls -1 | sed -e 's/.xml$//' | sort -n > "/path/to/folder/coco.txt"
And then read the comments in this script to understand its working
import os import xml.etree.ElementTree as ET import xmltodict import json from xml.dom import minidom from collections import OrderedDict
def generateVOC2Json(rootDir,xmlFiles): attrDict = dict()
Add categories according to you Pascal VOC annotations
attrDict["categories"]=[{"supercategory":"none","id":1,"name":"Table"}, {"supercategory":"none","id":2,"name":"cell"}, {"supercategory":"none","id":3,"name":"borderless"}
{"supercategory":"none","id":4,"name":"item_name"},
images = list() annotations = list() id1 = 1
Some random variables
cnt_bor = 0 cnt_cell = 0 cnt_bless = 0
Main execution loop
for root, dirs, files in os.walk(rootDir): image_id = 0 for file in xmlFiles: image_id = image_id + 1 if file in files: annotation_path = os.path.abspath(os.path.join(root, file)) image = dict() doc = xmltodict.parse(open(annotation_path,'r',encoding='UTF-8').read()) image['file_name'] = str(doc['annotation']['filename']) image['height'] = int(doc['annotation']['size']['height']) image['width'] = int(doc['annotation']['size']['width']) image['id'] = image_id print("File Name: {} and image_id {}".format(file, image_id)) images.append(image) if 'object' in doc['annotation']: for key,vals in doc['annotation'].items(): if(key=='object'): for value in attrDict["categories"]: if(not isinstance(vals, list)): vals = [vals] for val in vals: if str(val['name']) == value["name"]: annotation = dict() annotation["iscrowd"] = 0 annotation["image_id"] = image_id x1 = int(val["bndbox"]["xmin"]) - 1 y1 = int(val["bndbox"]["ymin"]) - 1 x2 = int(val["bndbox"]["xmax"]) - x1 y2 = int(val["bndbox"]["ymax"]) - y1 annotation["bbox"] = [x1, y1, x2, y2] annotation["area"] = float(x2 * y2) annotation["category_id"] = value["id"]
attrDict["images"] = images
attrDict["annotations"] = annotations attrDict["type"] = "instances"
Printing out some statisticsf
print(len(images)) print("Bordered : ",cnt_bor," Cell : ",cnt_cell," Bless : ",cnt_bless) print(len(annotations))
Save the final JSON file
jsonString = json.dumps(attrDict)
jsonString = json.dumps(attrDict, indent = 4, sort_keys=True) with open(r'D:\banmian\data\ICDAR2019_cTDaR\ICDAR2019_cTDaR\test\TRACKB2\coco.json', "w",encoding='UTF-8') as f: f.write(jsonString)
Path to the txt file (see at the top of this script)
trainFile = r"D:\banmian\data\ICDAR2019_cTDaR\ICDAR2019_cTDaR\test\TRACKB2\dataset.txt" trainXMLFiles = list() with open(trainFile, "r") as f: for line in f: fileName = line.strip() print(fileName) trainXMLFiles.append(fileName + ".xml")
Path to the pascal voc xml files
rootDir = "D:/banmian/data/ICDAR2019_cTDaR/ICDAR2019_cTDaR/test_ground_truth/TRACKB2"
Start execution
generateVOC2Json(rootDir, trainXMLFiles)
:\anaconda\envs\cacsde\python.exe "D:/banmian/CascadeTabNet-master/CascadeTabNet-master/Data Preparation/generateVOC2JSON.py" Traceback (most recent call last): File "D:/banmian/CascadeTabNet-master/CascadeTabNet-master/Data Preparation/generateVOC2JSON.py", line 120, in
generateVOC2Json(rootDir, trainXMLFiles)
File "D:/banmian/CascadeTabNet-master/CascadeTabNet-master/Data Preparation/generateVOC2JSON.py", line 49, in generateVOC2Json
image['file_name'] = str(doc['annotation']['filename'])
KeyError: 'annotation'
cTDaR_t10000