geeksforgeeks.org / Java Program to Extract Content from a PDF

Java Program to Extract Content from a PDF https://www.geeksforgeeks.org/java-program-to-extract-content-from-a-pdf/

// Java Program to Extract Content from a PDF

// Importing java input/output classes
import java.io.File;
import java.io.FileInputStream;
// Importing Apache Tika classes
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;

/**
 * Demonstrates extracting the text content of a PDF document with the
 * Apache Tika {@code PDFParser} and printing it to standard output.
 */
public class GFG {

    // PDF that is read when no path is supplied on the command line
    private static final String DEFAULT_PDF_PATH
        = "C:/extractcontent.pdf";

    /**
     * Main driver method: parses a PDF file and prints its text content.
     *
     * @param args optional; when present, {@code args[0]} is the path of
     *             the PDF to read, otherwise the default path is used
     * @throws Exception if the file cannot be opened or the parser
     *                   reports a failure
     */
    public static void main(String[] args) throws Exception
    {

        // Use the path given on the command line, falling back to the
        // original hard-coded location
        String path = (args != null && args.length > 0)
                          ? args[0]
                          : DEFAULT_PDF_PATH;
        File f = new File(path);

        // Create a content handler; a write limit of -1 disables the
        // default 100,000 character cap, which would otherwise abort
        // the parse of larger documents with a SAXException
        BodyContentHandler contenthandler
            = new BodyContentHandler(-1);

        // Create an object of type Metadata to use
        Metadata data = new Metadata();

        // Create a context parser for the pdf document
        ParseContext context = new ParseContext();

        // PDF document can be parsed using the PDFParser class
        PDFParser pdfparser = new PDFParser();

        // Open the stream in try-with-resources so that it is closed
        // even when parsing throws
        try (FileInputStream fstream = new FileInputStream(f)) {

            // Method parse invoked on PDFParser class
            pdfparser.parse(fstream, contenthandler, data,
                            context);
        }

        // Printing the contents of the pdf document
        // using toString() method in java
        System.out.println("Extracting contents :"
                        + contenthandler.toString());
    }
}

beroca / shell

geeksforgeeks.org / Java Program to Extract Content from a PDF #23