Zuhaib121 / PDF-reader

PDF Reader
0 stars 0 forks source link

PDF reader #1

Closed Zuhaib121 closed 7 months ago

Zuhaib121 commented 7 months ago

package com.viveknaskar.pdfgenerator;

import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font;

import java.io.IOException;

public class GeneratePDF {

/**
 * Creates a one-page PDF containing a single line of text and saves it under
 * the file name {@code demoDocument.pdf}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {

    String filename = "demoDocument.pdf";
    String message = "Illustration of demo PDF doc created using PDFBox.";

    try (PDDocument doc = new PDDocument()) {
        PDPage page = new PDPage();
        doc.addPage(page);

        PDFont font = PDType1Font.TIMES_ROMAN;

        // try-with-resources closes the content stream even when one of the
        // drawing calls throws, so the stream is never left open.
        try (PDPageContentStream contents = new PDPageContentStream(doc, page)) {
            contents.beginText();
            contents.setFont(font, 20);
            contents.newLineAtOffset(40, 600);
            contents.showText(message);
            contents.endText();
        }

        // The content stream must be closed before the document is saved.
        doc.save(filename);
    } catch (IOException e) {
        e.printStackTrace();
    }

}

} package com.viveknaskar.pdfgenerator;

import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font;

import java.io.IOException;

public class GeneratePDF {

/**
 * Creates a one-page PDF containing a single line of text and saves it under
 * the file name {@code demoDocument.pdf}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {

    String filename = "demoDocument.pdf";
    String message = "Illustration of demo PDF doc created using PDFBox.";

    try (PDDocument doc = new PDDocument()) {
        PDPage page = new PDPage();
        doc.addPage(page);

        PDFont font = PDType1Font.TIMES_ROMAN;

        // try-with-resources closes the content stream even when one of the
        // drawing calls throws, so the stream is never left open.
        try (PDPageContentStream contents = new PDPageContentStream(doc, page)) {
            contents.beginText();
            contents.setFont(font, 20);
            contents.newLineAtOffset(40, 600);
            contents.showText(message);
            contents.endText();
        }

        // The content stream must be closed before the document is saved.
        doc.save(filename);
    } catch (IOException e) {
        e.printStackTrace();
    }

}

} package com.viveknaskar.pdfgenerator;

import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition;

import java.io.*; import java.util.ArrayList; import java.util.Collections; import java.util.List;

public class GetWordsFromPDF extends PDFTextStripper {

static List<String> words = new ArrayList<>();

/**
 * Creates a word-collecting stripper using the superclass defaults.
 *
 * @throws IOException if the superclass constructor fails
 */
public GetWordsFromPDF() throws IOException {
    super();
}

/**
 * Loads {@code demoDocument.pdf}, runs a {@code GetWordsFromPDF} stripper over
 * all of its pages and prints every collected word on its own line.
 *
 * @param args command-line arguments (unused)
 * @throws IOException If there is an error parsing the document.
 */
public static void main(String[] args) throws IOException {
    String fileName = "demoDocument.pdf"; // replace with your PDF file name
    try (PDDocument document = PDDocument.load(new File(fileName))) {
        PDFTextStripper stripper = new GetWordsFromPDF();
        stripper.setSortByPosition(true);
        // PDFTextStripper page numbers are 1-based, so the first page is 1.
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());

        // The written text is discarded; the words are captured by the
        // overridden writeString() as a side effect of writeText().
        try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
            stripper.writeText(document, dummy);
        }

        // print words
        for (String word : words) {
            System.out.println(word);
        }
    }
}

/**
 * Override the default functionality of PDFTextStripper.writeString():
 * instead of writing the text out, split it on the configured word separator
 * and append the non-empty pieces to {@code words}.
 *
 * @param str           the text chunk handed over by the stripper
 * @param textPositions the positions belonging to {@code str} (unused here)
 */
@Override
protected void writeString(String str, List<TextPosition> textPositions) {
    // String.split() interprets its argument as a regular expression; quote the
    // separator so that characters such as '|' or '.' are matched literally.
    String separatorRegex = java.util.regex.Pattern.quote(getWordSeparator());
    for (String word : str.split(separatorRegex)) {
        // Leading or repeated separators yield empty tokens, which are not words.
        if (!word.isEmpty()) {
            words.add(word);
        }
    }
}

} package com.viveknaskar.pdfgenerator;

import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition;

import java.io.*; import java.util.ArrayList; import java.util.Collections; import java.util.List;

public class GetWordsFromPDF extends PDFTextStripper {

static List<String> words = new ArrayList<>();

/**
 * Creates a word-collecting stripper using the superclass defaults.
 *
 * @throws IOException if the superclass constructor fails
 */
public GetWordsFromPDF() throws IOException {
    super();
}

/**
 * Loads {@code demoDocument.pdf}, runs a {@code GetWordsFromPDF} stripper over
 * all of its pages and prints every collected word on its own line.
 *
 * @param args command-line arguments (unused)
 * @throws IOException If there is an error parsing the document.
 */
public static void main(String[] args) throws IOException {
    String fileName = "demoDocument.pdf"; // replace with your PDF file name
    try (PDDocument document = PDDocument.load(new File(fileName))) {
        PDFTextStripper stripper = new GetWordsFromPDF();
        stripper.setSortByPosition(true);
        // PDFTextStripper page numbers are 1-based, so the first page is 1.
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());

        // The written text is discarded; the words are captured by the
        // overridden writeString() as a side effect of writeText().
        try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
            stripper.writeText(document, dummy);
        }

        // print words
        for (String word : words) {
            System.out.println(word);
        }
    }
}

/**
 * Override the default functionality of PDFTextStripper.writeString():
 * instead of writing the text out, split it on the configured word separator
 * and append the non-empty pieces to {@code words}.
 *
 * @param str           the text chunk handed over by the stripper
 * @param textPositions the positions belonging to {@code str} (unused here)
 */
@Override
protected void writeString(String str, List<TextPosition> textPositions) {
    // String.split() interprets its argument as a regular expression; quote the
    // separator so that characters such as '|' or '.' are matched literally.
    String separatorRegex = java.util.regex.Pattern.quote(getWordSeparator());
    for (String word : str.split(separatorRegex)) {
        // Leading or repeated separators yield empty tokens, which are not words.
        if (!word.isEmpty()) {
            words.add(word);
        }
    }
}

} package com.viveknaskar.pdfgenerator;

import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File; import java.io.IOException;

/**
 * Command-line demo that prints the text content of {@code demoDocument.pdf}.
 */
public class ReadFromPDF {

    /**
     * Loads {@code demoDocument.pdf}, extracts its text with a default
     * {@code PDFTextStripper} and prints it to standard output. An
     * {@link IOException} is reported via its stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // try-with-resources closes the document on both the success and the
        // failure path; previously it was never closed.
        try (PDDocument doc = PDDocument.load(new File("demoDocument.pdf"))) {
            String text = new PDFTextStripper().getText(doc);
            System.out.println("The text in the PDF is: \n---------------------------------");
            System.out.println(text);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

Zuhaib121 commented 7 months ago

PDF Generator Application

A simple Java application (or rather a series of Java classes) that creates a PDF, reads text from a PDF, extracts the words from the PDF, and even fetches the coordinates of each word in the document.

This application uses the open-source Apache PDFBox library, which allows the creation of new PDF documents, the manipulation of existing documents, and the extraction of content from documents.

Reference link: https://pdfbox.apache.org/index.html

Zuhaib121 commented 7 months ago

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">

<modelVersion>4.0.0</modelVersion>
<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>2.3.4.RELEASE</version>
    <relativePath/>
</parent>

<groupId>com.viveknaskar.pdfgenerator</groupId>
<artifactId>pdf-generator</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>PDF Generator</name>
<description>Java Application to Create PDF using Apache PDFBox</description>

<dependencies>

    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.22</version>
    </dependency>
</dependencies>

Zuhaib121 commented 7 months ago

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">

<modelVersion>4.0.0</modelVersion>
<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>2.3.4.RELEASE</version>
    <relativePath/>
</parent>

<groupId>com.viveknaskar.pdfgenerator</groupId>
<artifactId>pdf-generator</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>PDF Generator</name>
<description>Java Application to Create PDF using Apache PDFBox</description>

<dependencies>

    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.22</version>
    </dependency>
</dependencies>