Closed ejlee95 closed 1 year ago
Hi @ejlee95, that part hasn't been done. For that you need to pass each bounding box's information from the XML file to the OCR.
`from xmlutils.xml2csv import xml2csv
import cv2
import io
import requests
import pytesseract
import multiprocessing as mp
import pandas as pd
def convert_xml_to_table_cell(path_to_xml):
    """Convert a table-cell annotation XML file to CSV.

    Args:
        path_to_xml: path to the XML file containing ``tablecell`` tags.

    Returns:
        Path of the CSV file that was written (same location/name as the
        XML, with a ``.csv`` extension).
    """
    # Replace only the file extension. The original
    # path_to_xml.replace("xml", "csv") rewrote EVERY "xml" substring in
    # the path, so a file inside e.g. "xml_generating_postprocessor/"
    # would have its directory name mangled too.
    if path_to_xml.endswith(".xml"):
        csv_path = path_to_xml[:-len(".xml")] + ".csv"
    else:
        csv_path = path_to_xml.replace("xml", "csv")
    converter = xml2csv(path_to_xml, csv_path, encoding="utf-8")
    converter.convert(tag="tablecell")
    return csv_path
def image_to_array(path_to_the_image):
    """Load the image at *path_to_the_image* as a grayscale numpy array.

    The second argument ``0`` is ``cv2.IMREAD_GRAYSCALE``.
    """
    return cv2.imread(path_to_the_image, 0)
def text_extraction(img_path):
    """Run Tesseract OCR on *img_path* (an image array or file path).

    Returns:
        The recognized text as a string.
    """
    return pytesseract.image_to_string(img_path)
def worker(img_path):
    """Module-level wrapper around :func:`text_extraction`.

    Kept at top level so it is picklable by ``multiprocessing.Pool``.
    """
    return text_extraction(img_path)
def parallel_processing_of_all_pos(path_to_csv_file, img_array):
    """OCR every table-cell crop of *img_array* in parallel.

    Args:
        path_to_csv_file: CSV with one row per cell and bounding-box
            columns ``x0``, ``x1``, ``y0``, ``y1``.
        img_array: grayscale page image as a 2-D numpy array.

    Returns:
        List of OCR strings, one per CSV row, in row order.
    """
    df = pd.read_csv(path_to_csv_file)
    # Cast coordinates to int: pandas may parse them as floats, and
    # float slice indices raise TypeError on numpy arrays.
    all_position_lst = [
        (int(row.x0), int(row.x1), int(row.y0), int(row.y1))
        for row in df.itertuples(index=False)
    ]
    num_core = 6
    # Context manager ensures the pool is terminated/joined even if an
    # OCR worker raises (the original leaked the pool on error).
    with mp.Pool(num_core) as pool:
        # Rows are the y-axis, columns the x-axis: crop = img[y0:y1, x0:x1].
        list_of_results = pool.map(
            worker,
            (img_array[y0:y1, x0:x1] for x0, x1, y0, y1 in all_position_lst),
        )
    return list_of_results
def preparing_table_structure(results_lst, csv_path):
    """Arrange per-cell OCR text into per-column tuples.

    Args:
        results_lst: list of OCR strings, one per CSV row, in row order.
        csv_path: CSV with at least ``end_row`` and ``end_col`` columns
            (same row order as *results_lst*).

    Returns:
        A list with one tuple per distinct ``end_col`` (ascending).  Each
        tuple has one entry per row index ``0..max(end_row)``: the
        concatenated text of all cells at that (row, col) position, or
        ``""`` when the position is empty.
    """
    df = pd.read_csv(csv_path)
    # NOTE: the original built a flattened copy of results_lst and then
    # immediately discarded it; the effective behaviour — kept here — is
    # to pair results_lst with the CSV rows as-is.
    text_only = pd.DataFrame(results_lst, columns=['text'])
    bigdata = pd.concat([df, text_only], axis=1)
    # int() guards against pandas returning a float max.
    max_row = int(bigdata['end_row'].max())
    grouped = bigdata.groupby('end_col')
    final_list = []
    # Distinct loop variables: the original reused `ele` for both the
    # column and the row loop, shadowing the outer value.
    for col in sorted(bigdata['end_col'].unique().tolist()):
        df_temp = grouped.get_group(col).reset_index()
        com_text = []
        for row in range(max_row + 1):
            position_to_check = df_temp.loc[df_temp['end_row'] == row]
            if position_to_check.shape[0] > 0:
                cell_texts = [
                    position_to_check.iloc[i]['text']
                    for i in range(position_to_check.shape[0])
                ]
                com_text.append("".join(cell_texts))
            else:
                com_text.append("")
        final_list.append(tuple(com_text))
    return final_list
def console_ops_for_table_structure(xml_path, image_path):
    """End-to-end pipeline: XML annotations + page image -> text table.

    Converts the XML to CSV, OCRs each cell crop in parallel, and
    assembles the recognized text into a DataFrame (one column per
    ``end_col`` group).
    """
    returned_csv_path = convert_xml_to_table_cell(xml_path)
    im_array = image_to_array(image_path)
    all_res = parallel_processing_of_all_pos(returned_csv_path, im_array)
    print(all_res)  # debug: raw OCR output per cell
    interim_result_list = preparing_table_structure(all_res, returned_csv_path)
    # Each tuple becomes one column; stitch them together side by side.
    column_frames = [pd.DataFrame(column) for column in interim_result_list]
    return pd.concat(column_frames, axis=1)
if __name__ == "__main__":
    # Example run on a sample annotation/image pair.
    path_image = "test.jpg"
    path_xml = "test.xml"
    observed_res = console_ops_for_table_structure(path_xml, path_image)
    print(observed_res)
`
Maybe you can try this and let me know if it works for your case.
Hi, Could you help me run the evaluation script? It says "exceeds 10% of system memory." and exits itself on google colab. Here is the screenshot.
I am not sure what I am doing wrong.
Here is the link to the notebook TabStructNet
I have spent more time on this than I should have. Please help — thank you.
Hi @martian1231 , I don't think that's possible there as it has minimum memory and resource will get exhausted. Can you try it in your CPU with better memory, as keras model predict need to get the row and column adjacency matrix too which will consume much of your memory so won't be possible with the infrastructure provided in colab
Hi @sreejith3534, Even when using CPU (which is default I guess) it is throwing the same warning + "resourceExhaustedError" which terminates the program. I have 15.7 GB of useable RAM and I have changed few settings in the config file (like setting TRAIN_ROIS_PER_IMAGE to lower value) hoping it would work. Here is the screengrab:
I am already running out of time, heavy-hearted I might have to give up on this repository and move on to something else.
Thank you for your help.
`from xmlutils.xml2csv import xml2csv import cv2 import io import requests import pytesseract import multiprocessing as mp import pandas as pd def convert_xml_to_table_cell(path_to_xml): csv_path = path_to_xml.replace("xml", "csv") converter = xml2csv(path_to_xml, csv_path, encoding="utf-8") converter.convert(tag="tablecell") return csv_path def image_to_array(path_to_the_image): img = cv2.imread(path_to_the_image, 0) return img def text_extraction(img_path): text = pytesseract.image_to_string(img_path) return text def worker(img_path): return text_extraction(img_path) def parallel_processing_of_all_pos(path_to_csv_file, img_array): df = pd.read_csv(path_to_csv_file) all_position_lst = [] for item_number in range(df.shape[0]): x0, x1, y0, y1 = df.iloc[item_number]['x0'], df.iloc[item_number]['x1'], df.iloc[item_number]['y0'], \ df.iloc[item_number]['y1'] all_position_lst.append((x0, x1, y0, y1)) num_core = 6 pool = mp.Pool(num_core) list_of_results = pool.map(worker, (img_array[obj[2]:obj[3], obj[0]:obj[1]] for obj in all_position_lst)) pool.close() pool.join() return list_of_results def preparing_table_structure(results_lst, csv_path): df = pd.read_csv(csv_path) flat_list = [item for sublist in results_lst for item in sublist] flat_list = results_lst text_only = pd.DataFrame(flat_list, columns=['text']) bigdata = pd.concat([df, text_only], axis=1) _max = bigdata['end_row'].max() mm = sorted(bigdata['end_col'].unique().tolist()) grouped = bigdata.groupby('end_col') final_list = [] for ele in mm: df_temp = grouped.get_group(ele).reset_index() com_text = [] for ele in range(0, _max + 1): temp_text = [] position_to_check = df_temp.loc[df_temp['end_row'] == ele] if position_to_check.shape[0] > 0: for text_ele in range(position_to_check.shape[0]): temp_text.append(position_to_check.iloc[text_ele]['text']) com_text.append("".join(temp_text)) else: com_text.append("") final_list.append(tuple(com_text)) return final_list def console_ops_for_table_structure(xml_path, 
image_path): returned_csv_path = convert_xml_to_table_cell(xml_path) im_array = image_to_array(image_path) all_res = parallel_processing_of_all_pos(returned_csv_path, im_array) print(all_res) interim_result_list = preparing_table_structure(all_res, returned_csv_path) df = [] for ele in range(len(interim_result_list)): df.append(pd.DataFrame(interim_result_list[ele])) final_result_list = pd.concat(df, axis=1) return final_result_list if __name__ == "__main__": path_image = "test.jpg" path_xml = "test.xml" observed_res = console_ops_for_table_structure(path_xml, path_image) print(observed_res)# `
may be u can try this and let know if this is working fine for your case.
Sorry for checking it lately...
I copy your code in the xml_generating_postprocessor folder, and I got the following error.
I couldn't find 'xmlutils.py' in all folders.
Thanks for your help.
Hi @sreejith3534, Even when using CPU (which is default I guess) it is throwing the same warning + "resourceExhaustedError" which terminates the program. I have 15.7 GB of useable RAM and I have changed few settings in the config file (like setting TRAIN_ROIS_PER_IMAGE to lower value) hoping it would work. Here is the screengrab:
I am already running out of time, heavy-hearted I might have to give up on this repository and move on to something else.
Thank you for your help.
This paper would help. https://openreview.net/forum?id=4tyWL6P08yY
`from xmlutils.xml2csv import xml2csv import cv2 import io import requests import pytesseract import multiprocessing as mp import pandas as pd def convert_xml_to_table_cell(path_to_xml): csv_path = path_to_xml.replace("xml", "csv") converter = xml2csv(path_to_xml, csv_path, encoding="utf-8") converter.convert(tag="tablecell") return csv_path def image_to_array(path_to_the_image): img = cv2.imread(path_to_the_image, 0) return img def text_extraction(img_path): text = pytesseract.image_to_string(img_path) return text def worker(img_path): return text_extraction(img_path) def parallel_processing_of_all_pos(path_to_csv_file, img_array): df = pd.read_csv(path_to_csv_file) all_position_lst = [] for item_number in range(df.shape[0]): x0, x1, y0, y1 = df.iloc[item_number]['x0'], df.iloc[item_number]['x1'], df.iloc[item_number]['y0'], \ df.iloc[item_number]['y1'] all_position_lst.append((x0, x1, y0, y1)) num_core = 6 pool = mp.Pool(num_core) list_of_results = pool.map(worker, (img_array[obj[2]:obj[3], obj[0]:obj[1]] for obj in all_position_lst)) pool.close() pool.join() return list_of_results def preparing_table_structure(results_lst, csv_path): df = pd.read_csv(csv_path) flat_list = [item for sublist in results_lst for item in sublist] flat_list = results_lst text_only = pd.DataFrame(flat_list, columns=['text']) bigdata = pd.concat([df, text_only], axis=1) _max = bigdata['end_row'].max() mm = sorted(bigdata['end_col'].unique().tolist()) grouped = bigdata.groupby('end_col') final_list = [] for ele in mm: df_temp = grouped.get_group(ele).reset_index() com_text = [] for ele in range(0, _max + 1): temp_text = [] position_to_check = df_temp.loc[df_temp['end_row'] == ele] if position_to_check.shape[0] > 0: for text_ele in range(position_to_check.shape[0]): temp_text.append(position_to_check.iloc[text_ele]['text']) com_text.append("".join(temp_text)) else: com_text.append("") final_list.append(tuple(com_text)) return final_list def console_ops_for_table_structure(xml_path, 
image_path): returned_csv_path = convert_xml_to_table_cell(xml_path) im_array = image_to_array(image_path) all_res = parallel_processing_of_all_pos(returned_csv_path, im_array) print(all_res) interim_result_list = preparing_table_structure(all_res, returned_csv_path) df = [] for ele in range(len(interim_result_list)): df.append(pd.DataFrame(interim_result_list[ele])) final_result_list = pd.concat(df, axis=1) return final_result_list if __name__ == "__main__": path_image = "test.jpg" path_xml = "test.xml" observed_res = console_ops_for_table_structure(path_xml, path_image) print(observed_res)# `
may be u can try this and let know if this is working fine for your case.
Sorry for checking it lately...
I copy your code in the xml_generating_postprocessor folder, and I got the following error.
I couldn't find 'xmlutils.py' in all folders.
Thanks for your help.
pip install xmlutils
`from xmlutils.xml2csv import xml2csv import cv2 import io import requests import pytesseract import multiprocessing as mp import pandas as pd def convert_xml_to_table_cell(path_to_xml): csv_path = path_to_xml.replace("xml", "csv") converter = xml2csv(path_to_xml, csv_path, encoding="utf-8") converter.convert(tag="tablecell") return csv_path def image_to_array(path_to_the_image): img = cv2.imread(path_to_the_image, 0) return img def text_extraction(img_path): text = pytesseract.image_to_string(img_path) return text def worker(img_path): return text_extraction(img_path) def parallel_processing_of_all_pos(path_to_csv_file, img_array): df = pd.read_csv(path_to_csv_file) all_position_lst = [] for item_number in range(df.shape[0]): x0, x1, y0, y1 = df.iloc[item_number]['x0'], df.iloc[item_number]['x1'], df.iloc[item_number]['y0'], \ df.iloc[item_number]['y1'] all_position_lst.append((x0, x1, y0, y1)) num_core = 6 pool = mp.Pool(num_core) list_of_results = pool.map(worker, (img_array[obj[2]:obj[3], obj[0]:obj[1]] for obj in all_position_lst)) pool.close() pool.join() return list_of_results def preparing_table_structure(results_lst, csv_path): df = pd.read_csv(csv_path) flat_list = [item for sublist in results_lst for item in sublist] flat_list = results_lst text_only = pd.DataFrame(flat_list, columns=['text']) bigdata = pd.concat([df, text_only], axis=1) _max = bigdata['end_row'].max() mm = sorted(bigdata['end_col'].unique().tolist()) grouped = bigdata.groupby('end_col') final_list = [] for ele in mm: df_temp = grouped.get_group(ele).reset_index() com_text = [] for ele in range(0, _max + 1): temp_text = [] position_to_check = df_temp.loc[df_temp['end_row'] == ele] if position_to_check.shape[0] > 0: for text_ele in range(position_to_check.shape[0]): temp_text.append(position_to_check.iloc[text_ele]['text']) com_text.append("".join(temp_text)) else: com_text.append("") final_list.append(tuple(com_text)) return final_list def console_ops_for_table_structure(xml_path, 
image_path): returned_csv_path = convert_xml_to_table_cell(xml_path) im_array = image_to_array(image_path) all_res = parallel_processing_of_all_pos(returned_csv_path, im_array) print(all_res) interim_result_list = preparing_table_structure(all_res, returned_csv_path) df = [] for ele in range(len(interim_result_list)): df.append(pd.DataFrame(interim_result_list[ele])) final_result_list = pd.concat(df, axis=1) return final_result_list if __name__ == "__main__": path_image = "test.jpg" path_xml = "test.xml" observed_res = console_ops_for_table_structure(path_xml, path_image) print(observed_res)# `
may be u can try this and let know if this is working fine for your case.
Sorry for checking it lately... I copy your code in the xml_generating_postprocessor folder, and I got the following error. I couldn't find 'xmlutils.py' in all folders. Thanks for your help.
pip install xmlutils
Ah..! Thanks for your reply. Sorry for asking that kind of question..
After installing xmlutils and pytesseract, it still had errors on Pool.map() like the following image,
so I changed multiprocessing -> pathos.multiprocessing, and it works. """"""""""""""""""""""""""""""""""""""""""""" from pathos.multiprocessing import ProcessingPool as Pool .... def parallel_processing_of_all_pos(path_to_csv_file, img_array): 34 df = pd.read_csv(path_to_csv_file) 35 all_position_lst = [] 36 for item_number in range(df.shape[0]): 37 x0, x1, y0, y1 = df.iloc[item_number]['x0'], df.iloc[item_number]['x1'], df.iloc[item_number]['y0'], \ 38 df.iloc[item_number]['y1'] 39 all_position_lst.append((x0, x1, y0, y1)) 40 num_core = 6 41 pool = Pool(num_core) #mp.Pool(num_core) 42 list_of_results = pool.map(worker, (img_array[obj[2]:obj[3], obj[0]:obj[1]] for obj in all_position_lst)) 43 44 pool.close() 45 pool.join() 46 return list_of_results """"""""""""""""""""""""""""""""""""""""""
Hi, thanks for your code.
While trying to evaluate your code, I found that the output XML files don't contain the contents in the cells. (files in xml_generating_postprocessor/processed_xml folder)
If available, could you update the code for extracting the OCR outputs..?
Thanks for considering my request.