tabulapdf / tabula-java

Extract tables from PDF files
MIT License
1.82k stars 425 forks source link

Tabula-java is skipping one row from Table #544

Open sikanderrafiq opened 4 months ago

sikanderrafiq commented 4 months ago

Hi, I am using Tabula-java to parse a table in a PDF file, but it is skipping one row; only alternate rows are fetched properly. I have attached my PDF file, named murree_ren.pdf. murree_ren.pdf

This is the code I have used:

/**
 * Loads the PDF at {@code D:/Pdfs/murree_ren.pdf}, runs a
 * {@code SpreadsheetExtractionAlgorithm} over every page, and prints each
 * detected table to standard output: one line per row, with the cell texts
 * joined by {@code "|"} and carriage returns inside a cell replaced by spaces.
 *
 * <p>The input stream and the document are closed when the method finishes,
 * whether extraction succeeds or fails. Any exception is caught here and
 * reported rather than propagated to the caller.
 */
public void parse() {

    System.out.println("TabulaPdfParser.parse-----------------------------------");

    File file = new File("D:/Pdfs/murree_ren.pdf");

    // try-with-resources releases the file handle and the document even when
    // extraction throws part-way through the page loop.
    try (FileInputStream inputStream = new FileInputStream(file);
         PDDocument document = PDDocument.load(inputStream)) {

        System.out.println("TabulaPdfParser.parse--------------------document loaded---------------");

        SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
        // The extractor only wraps 'document', which is closed by the enclosing try.
        @SuppressWarnings("resource")
        PageIterator pi = new ObjectExtractor(document).extract();

        while (pi.hasNext()) {
            // iterate over the pages of the document
            Page page = pi.next();
            List<Table> tables = sea.extract(page);
            System.out.println("TabulaPdfParser.parse------------------||||-----------------table size=" + tables.size());

            // iterate over the tables of the page
            for (Table table : tables) {
                int rowcount = table.getRowCount();
                int colcount = table.getColCount();
                System.out.println("TabulaPdfParser.parse------------getRowCount=" + rowcount + " colcount=" + colcount);

                for (int i = 0; i < rowcount; i++) {
                    StringBuilder line = new StringBuilder();
                    for (int j = 0; j < colcount; j++) {
                        RectangularTextContainer rect = table.getCell(i, j);
                        line.append(rect.getText().replace("\r", " "));
                        // separator between cells only, never after the last one
                        if (j < (colcount - 1)) {
                            line.append("|");
                        }
                    }
                    System.out.println("RowText:----------row no=" + i + " str=" + line);
                }
            }
        }

    } catch (Exception ex) {
        System.out.println("Exception:---------------------------------------=" + ex.getMessage());
        // getMessage() may be null and hides where the failure happened;
        // keep the full trace available on stderr for diagnosis.
        ex.printStackTrace();
    }

}

Here is the output:

TabulaPdfParser.parse----------------------------------- TabulaPdfParser.parse--------------------document loaded--------------- TabulaPdfParser.parse------------------||||-----------------table size=109 TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=2 colcount=7 RowText:----------row no=0 str=1|39167 24/11/2019|MUHAMMAD MATLOOB|MUHAMMAD TAJ|VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB|6/6/1976|F.A RowText:----------row no=1 str=|||||| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| 
TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=2 colcount=7 RowText:----------row no=0 str=3|455 18/06/2020|WAHEED ANWAR|ABDUL QADOUS|P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB|12/9/1954|MATRIC RowText:----------row no=1 str=|||||| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=WAHEED ANWAR| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=ABDUL QADOUS RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=P.O. 
AUSIA TEH MURREE, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=5 RowText:----------row no=0 str=|||| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=3 colcount=11 RowText:----------row no=0 str=5|61134 2020-12-08|AZRA PARVEEN|MUHAMMAD TALIB|V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|30/10/1966|MATRIC|||| RowText:----------row no=1 str=|||||||||| RowText:----------row no=2 str=|||||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=61134 2020-12-08 RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=61134 2020-12-08 RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=AZRA PARVEEN| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=|| 
TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=3 colcount=8 RowText:----------row no=0 str=7|60305 25/01/2021|AZRA NAHEED|MANZOOR HUSSAIN|BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|17/03/1973|MATRIC| RowText:----------row no=1 str=||||||| RowText:----------row no=2 str=||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MANZOOR HUSSAIN RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MANZOOR HUSSAIN RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB| 
RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=3 colcount=9 RowText:----------row no=0 str=9|59463 31/07/2021|MUNAWAR HUSSAIN|MUHAMMAD ABDULLAH|H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|01/03/1974|MATRIC|| RowText:----------row no=1 str=|||||||| RowText:----------row no=2 str=|||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MUNAWAR HUSSAIN RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MUHAMMAD ABDULLAH RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|| RowText:----------row no=1 
str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=4 RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB||| RowText:----------row no=1 str=||| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=3 colcount=11 RowText:----------row no=0 str=11|58306 10/01/2022|MUBASHAR ISHAQ QAMAR|MUHAMMAD ISHAQ|VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|26/02/1992|FSC|||| RowText:----------row no=1 str=|||||||||| RowText:----------row no=2 str=|||||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MUBASHAR ISHAQ QAMAR RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row 
no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=4 RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB||| RowText:----------row no=1 str=||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=FSC RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=3 colcount=12 RowText:----------row no=0 str=13|49597 19/06/2022|TAHIR MEHBOOB|MUHAMMAD MEHBOOB|GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB|15/7/1988|ICS , FSC HOMEO||||| RowText:----------row no=1 str=||||||||||| RowText:----------row no=2 str=||||||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=2 RowText:----------row no=0 str=TAHIR MEHBOOB| RowText:----------row no=1 str=| TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=TAHIR MEHBOOB|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=TAHIR MEHBOOB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=4 RowText:----------row no=0 str=MUHAMMAD MEHBOOB||| RowText:----------row no=1 str=||| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=1 
RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=ICS , FSC HOMEO RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=3 RowText:----------row no=0 str=|| TabulaPdfParser.parse------------getRowCount=1 colcount=1 RowText:----------row no=0 str= TabulaPdfParser.parse------------getRowCount=1 colcount=2 RowText:----------row no=0 str=| TabulaPdfParser.parse------------getRowCount=3 colcount=11 RowText:----------row no=0 str=15|53404 19/03/2023|MUHAMMAD AJMAL MALIK|MAHMOOD AHMED MALIK|VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB|26/2/1961|MATRIC|||| RowText:----------row no=1 str=|||||||||| RowText:----------row no=2 str=|||||||||| TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=MUHAMMAD AJMAL MALIK|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=MAHMOOD AHMED MALIK RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=|| TabulaPdfParser.parse------------getRowCount=2 colcount=1 RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB RowText:----------row no=1 str= TabulaPdfParser.parse------------getRowCount=2 colcount=3 RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB|| RowText:----------row no=1 str=||