jsvine / pdfplumber

Plumb a PDF for detailed information about each char, rectangle, line, et cetera — and easily extract text and tables.
MIT License
6.31k stars 647 forks source link

How get merged cells #685

Open hbh112233abc opened 2 years ago

hbh112233abc commented 2 years ago

Please describe, in as much detail as possible, your proposal and how it would improve your experience with pdfplumber. When I use xlrd to parse an .xls file with formatting_info=True, I can get merged_cells, which works well. I hope pdfplumber can support the same function. For example: mark a cell as a merged cell and report its range (row_start, row_end, col_start, col_end).

hbh112233abc commented 2 years ago
    def humanized_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[List[dict]]:
        """Describe every table found on the page, cell by cell.

        Args:
            table_settings: Table-finding settings; resolved with
                ``TableSettings.resolve`` and passed to ``self.find_tables``.

        Returns:
            ``None`` when no table is found. Otherwise a list with one dict
            per table, holding:

            * ``index``: position of the table in the ``find_tables`` result.
            * ``bbox``: the table's bounding box.
            * ``rows``: one list per table row with a dict for every
              non-``None`` cell (``bbox``, ``words``, ``text``, ``row``,
              ``col``, ``merged``).
            * ``merged_cells``: every merged range detected in the table.
        """
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)

        if not tables:
            return None

        def get_merged(
            rows: List[Row],
            row_start: int,
            col_start: int,
        ) -> Optional[T_bbox]:
            """Return the merged range anchored at one cell, if any.

            The range is grown over ``None`` cells only: first to the right
            along the anchor row, then downwards for as long as every cell
            under the horizontal span is ``None``.

            NOTE(review): every ``None`` neighbour is treated as covered by
            the anchor cell, so a grid position that is empty for another
            reason (irregular table) is reported as merged too.

            Args:
                rows: ``Table.rows`` of the table being inspected.
                row_start: Row index of the anchor cell.
                col_start: Column index of the anchor cell.

            Returns:
                ``(row_start, row_end, col_start, col_end)`` as inclusive
                indices, or ``None`` when the range is the anchor cell alone.
            """
            anchor_cells = rows[row_start].cells
            col_end = col_start
            # The column count is taken from the first row.
            for col in range(col_start + 1, len(rows[0].cells)):
                if anchor_cells[col] is not None:
                    break
                col_end = col

            row_end = row_start
            for row in range(row_start + 1, len(rows)):
                span = rows[row].cells[col_start : col_end + 1]
                if set(span) != {None}:
                    break
                row_end = row

            if (row_end, col_end) == (row_start, col_start):
                return None
            return (row_start, row_end, col_start, col_end)

        result = []
        for table_index, table in enumerate(tables):
            table_info = {
                'index': table_index,
                'bbox': table.bbox,
                'rows': [],
                'merged_cells': [],
            }
            # Read the rows once per table instead of once per cell.
            rows = table.rows
            for row_index, row in enumerate(rows):
                row_cells = []
                for col_index, cell in enumerate(row.cells):
                    if cell is None:
                        continue
                    merged = get_merged(rows, row_index, col_index)
                    if merged is not None:
                        table_info["merged_cells"].append(merged)
                    words = table.page.crop(cell).extract_words()
                    # Word texts are concatenated without any separator.
                    content = "".join([x["text"] for x in words])
                    row_cells.append(
                        {
                            "bbox": cell,
                            "words": words,
                            "text": content,
                            "row": row_index,
                            "col": col_index,
                            "merged": merged,
                        }
                    )
                table_info['rows'].append(row_cells)
            result.append(table_info)

        return result

I think the code above can realize my idea!

jsvine commented 2 years ago

Hi @hbh112233abc, and thanks for your suggestion. This is a neat idea! I've been thinking about how to improve the representation of tables, and will consider this as one of the possible approaches.

hbh112233abc commented 2 years ago

Hi @hbh112233abc, and thanks for your suggestion. This is a neat idea! I've been thinking about how to improve the representation of tables, and will consider this as one of the possible approaches.

Sorry, @jsvine, my code produces incorrect merged_cells when the table is irregular, so I hope you can provide a robust solution for getting merged cells.

hbh112233abc commented 2 years ago

Hi @jsvine, I found a new solution for getting merged_cells, and it is better and more precise!

First, we collect the row and column coordinate marks like this:

def table_row_col_mark(table: pdfplumber.table.Table) -> Dict[str, List[float]]:
    """Collect the sorted grid-line coordinates of a table.

    Rows are marked by Y coordinates and columns by X coordinates. Both
    edges of every cell are recorded (``cell[1]``/``cell[3]`` for rows,
    ``cell[0]``/``cell[2]`` for columns), so the far edge of the last row
    and of the last column can be looked up as well.

    Coordinates are rounded to 3 decimals *before* de-duplication, so two
    values that differ only by float noise produce a single mark.

    Args:
        table (pdfplumber.table.Table): table object; ``None`` cells in
            ``table.rows`` are skipped.

    Returns:
        Dict[str,List[float]]: ``{"rows": [...], "cols": [...]}`` with each
        list sorted ascending and free of duplicates.
    """
    row_marks = set()
    col_marks = set()
    for row in table.rows:
        for cell in row.cells:
            if cell is None:
                continue
            row_marks.update((round(cell[1], 3), round(cell[3], 3)))
            col_marks.update((round(cell[0], 3), round(cell[2], 3)))
    return {
        "rows": sorted(row_marks),
        "cols": sorted(col_marks),
    }

Second, we check whether a cell is merged:

def check_merged(
    cell_bbox: Tuple[float, float, float, float],
    row_col_mark: Dict[str, List[float]],
) -> Tuple[int, int, int, int]:
    """Return the grid range covered by a cell when it spans several cells.

    The bbox is rounded to 3 decimals and each edge is looked up in the
    coordinate marks. The end indices are the index of the far edge minus
    one, i.e. the last row/column the cell covers.

    Args:
        cell_bbox (Tuple[float, float, float, float]): cell bbox
        row_col_mark (Dict[str, List[float]]): marks with the keys
            ``"rows"`` (Y coordinates) and ``"cols"`` (X coordinates)

    Returns:
        ``[row_start, row_end, col_start, col_end]`` (a list) when the cell
        covers more than one row or column. ``None`` when it covers exactly
        one grid cell, or when one of its edges is not among the marks.
    """
    cell = [round(x, 3) for x in cell_bbox]

    rows_y = row_col_mark["rows"]
    cols_x = row_col_mark["cols"]

    try:
        row_start = rows_y.index(cell[1])
        row_end = rows_y.index(cell[3]) - 1
        col_start = cols_x.index(cell[0])
        col_end = cols_x.index(cell[2]) - 1
    except ValueError:
        # list.index found no mark for this edge: the cell is off the grid.
        # Any other error is a real bug and is no longer hidden.
        return None

    if row_end == row_start and col_end == col_start:
        return None
    return [row_start, row_end, col_start, col_end]

If the result is None, the cell is not a merged cell.

hieudx149 commented 1 year ago

Hello @hbh112233abc, your algorithm is impressive. However, I have noticed that it misses some cases when there are merged cells in the last row or last column. Therefore, I suggest modifying your code as follows:

from typing import Dict, List, Tuple

def check_merged(
    cell_bbox: Tuple[float, float, float, float],
    row_col_mark: Dict[str, List[float]],
) -> Tuple[int, int, int, int]:
    """Return the grid range covered by a cell when it spans several cells.

    The bbox is rounded to 3 decimals and its edges are looked up in the
    coordinate marks. A far edge that lies beyond the last mark is taken
    to end in the last row/column, which covers marks built from top/left
    edges only; otherwise the end index is the index of the far edge
    minus one.

    Args:
        cell_bbox (Tuple[float, float, float, float]): cell bbox
        row_col_mark (Dict[str, List[float]]): marks with the keys
            ``"rows"`` (Y coordinates) and ``"cols"`` (X coordinates)

    Returns:
        ``[row_start, row_end, col_start, col_end]`` (a list) when the cell
        covers more than one row or column. ``None`` when it covers exactly
        one grid cell, or when one of its edges is not among the marks.
    """
    cell = [round(x, 3) for x in cell_bbox]
    top, bottom = cell[1], cell[3]
    left, right = cell[0], cell[2]

    rows_y = row_col_mark["rows"]
    cols_x = row_col_mark["cols"]
    try:
        row_start = rows_y.index(top)
        if bottom > rows_y[-1]:
            row_end = len(rows_y) - 1
        else:
            row_end = rows_y.index(bottom) - 1
        col_start = cols_x.index(left)
        if right > cols_x[-1]:
            col_end = len(cols_x) - 1
        else:
            col_end = cols_x.index(right) - 1
    except ValueError:
        # list.index found no mark for this edge: the cell is off the grid.
        # Any other error is a real bug and is no longer hidden.
        return None

    if row_end == row_start and col_end == col_start:
        return None
    return [row_start, row_end, col_start, col_end]
hbh112233abc commented 1 year ago

check_merged

row_col_mark contains all cells of the table, so if cell[3] > rows_y[-1] or cell[2] > cols_x[-1], I think the cell is not in the table. Looking up its index will then raise an exception, so the function returns None and cannot check whether the cell is merged.

hieudx149 commented 1 year ago

@hbh112233abc, I don't think so. You only add the top and left coordinates of each cell to row_col_mark, so when a merged cell contains the last cell in a row or the last cell in a column, we have to handle those cases with cell[3] > rows_y[-1] or cell[2] > cols_x[-1].

John-Peter-R commented 1 year ago

Any idea for merged cells within a table in a pdf @hieudx149 @hbh112233abc @jsvine

hbh112233abc commented 1 year ago

@hbh112233abc, i don't think so, because you only add top, left coordinates of cell into row_col_mark then when a merged cell contain the last cell in row or the last cell in column we should check these case with cell[3] > rows_y[-1] or cell[2] > cols_x[-1]

@hieudx149 can you share an example PDF where it does not work? I made a demo; the table in the PDF looks like the image, and here is the test code:

#!/usr/bin/python
# -*- coding: utf-8 -*-

from pathlib import Path
from typing import Dict, List, Tuple

import pdfplumber

def check_merged_cells(
    cell_bbox: Tuple[float, float, float, float],
    row_col_mark: Dict[str, List[float]],
) -> Tuple[int, int, int, int]:
    """Return the grid range covered by a cell when it is a merged cell.

    The bbox is rounded to 3 decimals and each of its four edges is looked
    up in the coordinate marks. The end indices are the index of the far
    edge minus one, i.e. the last row/column the cell covers.

    Args:
        cell_bbox (Tuple[float, float, float, float]): cell bbox
        row_col_mark (Dict[str, List[float]]): marks with the keys
            ``"rows"`` (Y coordinates) and ``"cols"`` (X coordinates)

    Returns:
        ``[row_start, row_end, col_start, col_end]`` (a list) when the cell
        covers more than one row or column. ``None`` when it covers exactly
        one grid cell, or when one of its edges is not among the marks.
    """
    left, top, right, bottom = (round(x, 3) for x in cell_bbox)

    rows_y = row_col_mark["rows"]
    cols_x = row_col_mark["cols"]

    try:
        row_start = rows_y.index(top)
        row_end = rows_y.index(bottom) - 1
        col_start = cols_x.index(left)
        col_end = cols_x.index(right) - 1
    except ValueError:
        # list.index found no mark for this edge: the cell is off the grid.
        # Any other error is a real bug and is no longer hidden.
        return None

    single_cell = row_end == row_start and col_end == col_start
    return None if single_cell else [row_start, row_end, col_start, col_end]

def table_row_col_mark(table: pdfplumber.table.Table) -> Dict[str, List[float]]:
    """Get the coordinate lists of the rows and columns in the table.

    Both edges of every non-``None`` cell are recorded: ``cell[1]`` and
    ``cell[3]`` as row coordinates, ``cell[0]`` and ``cell[2]`` as column
    coordinates. Values are rounded to 3 decimals *before* de-duplication,
    so coordinates that differ only by float noise yield one mark instead
    of two equal ones (a duplicate would shift every later index).

    Args:
        table (pdfplumber.table.Table): table object

    Returns:
        Dict[str,List[float]]: {
            "rows": sorted, duplicate-free list of row coordinates
            "cols": sorted, duplicate-free list of column coordinates
        }
    """
    cells = [
        cell
        for row in table.rows
        for cell in row.cells
        if cell is not None
    ]
    rows_value = {round(y, 3) for cell in cells for y in (cell[1], cell[3])}
    cols_value = {round(x, 3) for cell in cells for x in (cell[0], cell[2])}
    return {
        "rows": sorted(rows_value),
        "cols": sorted(cols_value),
    }

# Demo: every merged cell in merged_cell.pdf contains its own expected range
# as text ("row_start,row_end|col_start,col_end"), so the detected range can
# be checked against the cell's content.
file = Path(__file__).parent / "merged_cell.pdf"

merged_cells = []
# The context manager closes the PDF even when an assertion below fails.
with pdfplumber.open(file) as pdf:
    page = pdf.pages[0]
    for table in page.find_tables():
        row_col_mark = table_row_col_mark(table)
        for row in table.rows:
            for cell in row.cells:
                if cell is None:
                    continue
                merged = check_merged_cells(cell, row_col_mark)
                if not merged:
                    continue
                text = page.within_bbox(cell).extract_text()
                print(text)
                assert text == "{},{}|{},{}".format(*merged)
                merged_cells.append(cell)

# The demo PDF is expected to yield 14 merged cells.
assert len(merged_cells) == 14

The result of running it is correct.

demo.zip