// Java PDF example code - ExtractStructuredText.java


/*

 * Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)

 */

package org.jpedal.examples.text;



import org.jpedal.examples.text.output.HTMLStructuredTextWriter;

import org.jpedal.examples.text.output.XMLStructuredTextWriter;

import org.jpedal.examples.text.output.StructuredTextWriter;

import org.jpedal.examples.text.configuration.ExtractStructuredTextProperties;

import org.jpedal.examples.text.configuration.OutputModes;

import org.jpedal.exception.PdfException;

import org.jpedal.external.ErrorTracker;

import org.jpedal.external.Options;

import org.jpedal.utils.LogWriter;

import org.w3c.dom.Document;





/**

 * <h2>Extract Structured Content (if present) from PDF files</h2>

 * <br>

 * This class provides a simple Java API to extract Structured Content (if present) from a PDF file and also a static convenience method if you just want to dump any structured outlines from a PDF file or directory containing PDF files<br>

 * If no Structure is present a blank file is returned<br>

 * <br>For non-structured files, consider:

 * <ul>

 * <li><a href="http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextAsWordlist.java.html">ExtractTextAsWordList</a></li>

 * <li><a href="http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextInRectangle.java.html">ExtractTextInRectangle</a></li>

 * </ul>

 * <br><a href="https://support.idrsolutions.com/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction</a><br>

 */

public class ExtractStructuredText extends BaseTextExtraction {



    ExtractStructuredTextProperties properties = new ExtractStructuredTextProperties();



    /**

     * Sets up an ExtractStructuredText instance to open a PDF File

     *

     * @param fileName full path to a single PDF file

     */

    public ExtractStructuredText(final String fileName) {

        super(fileName);



        init();

    }



    /**

     * Sets up an ExtractStructuredText instance to open  a PDF file contained as a BLOB within a byte[] stream

     *

     * @param byteArray Array which will hold BLOB

     */

    public ExtractStructuredText(final byte[] byteArray) {

        super(byteArray);



        init();

    }



    /**

     * Sets up an ExtractStructuredText instance to open a PDF File

     *

     * @param fileName full path to a single PDF file

     * @param properties ExtractStructuredTextProperties object for configuring

     * extraction

     */

    public ExtractStructuredText(final String fileName, final ExtractStructuredTextProperties properties) {

        super(fileName);



        if (properties != null) {

            this.properties = properties;

        }



        init();

    }



    /**

     * Sets up an ExtractStructuredText instance to open a PDF file contained as

     * a BLOB within a byte[] stream

     *

     * @param byteArray Array which will hold BLOB

     * @param properties ExtractStructuredTextProperties object for configuring

     * extraction

     */

    public ExtractStructuredText(final byte[] byteArray, final ExtractStructuredTextProperties properties) {

        super(byteArray);



        if (properties != null) {

            this.properties = properties;

        }



        init();

    }



    /**

     * routine to decode a file

     */

    @Override

    void decodeFile(final String file_name) throws PdfException {



        fileName = file_name;

        if (openPDFFile()) {



            //read pages -if you already have code this is probably

            //all you need!

            final Document tree = getStructuredTextContent();



            if (tree != null) {



                final OutputModes x = properties.getFileOutputMode();

                switch (x) {

                    case XML:



                        final StructuredTextWriter XMLwriter = new XMLStructuredTextWriter(separator);

                        XMLwriter.write(tree, file_name, output_dir);

                        break;



                    case HTML:

                        final StructuredTextWriter HTMLwriter = new HTMLStructuredTextWriter(separator);

                        HTMLwriter.write(tree, file_name, output_dir);

                        break;

                    default:

                        break;

                }

            }

        }

    }



    /**

     * This class will allow you to extract any Structured Text data via command line from a single PDF file or a directory of PDF files.

     * <br>

     * The example expects two or three parameters:

     * <ul>

     * <li>Value 1 is the file name or directory of PDF files to process</li>

     * <li>Value 2 is directory to write out the outline data</li>

     * </ul>

     *

     * @param args The expected arguments are described above.

     */

    @SuppressWarnings("unused")

    public static void main(final String[] args) {

        final int len = args.length;

        switch (len) {

            case 0:

            case 1:

                System.out.println("Example takes 2 parameters");

                System.out.println("Value 1 is the file name or directory of PDF files to process");

                System.out.println("Value 2 is Directory for writing the data as text files");

                System.out.println("(Optional) Value 3 is the output format, defaults to xml if not present");

                System.exit(0);

            case 2:

                try {

                    writeAllStructuredTextOutlinesToDir(args[0], args[1]);

                } catch (final PdfException e) {

                    LogWriter.writeLog(e);

                }

                break;

            case 3:

                try {

                    final String outputFormat = args[2];

                    final ExtractStructuredTextProperties properties = new ExtractStructuredTextProperties();

                    switch (outputFormat.toLowerCase()) {

                        case "html" :

                            properties.setFileOutputMode(OutputModes.HTML);

                            break;

                        case "xml" :

                            properties.setFileOutputMode(OutputModes.XML);

                            break;

                        default:

                            final StringBuilder message = new StringBuilder();

                            message.append("Output format of \"").append(outputFormat).append(" is not recognised.\n");

                            message.append("Valid values are as follows,");

                            final OutputModes[] modes = OutputModes.values();

                            for (final OutputModes mode : modes) {

                                message.append(mode);

                            }

                            throw new IllegalArgumentException(message.toString());

                    }

                    writeAllStructuredTextOutlinesToDir(args[0], "", args[1], null, properties);

                } catch (final PdfException e) {

                    LogWriter.writeLog(e);

                }

                break;

            default:

                System.out.println("too many arguments entered - run with no values to see defaults");



                final StringBuilder arguments = new StringBuilder();

                for (final String arg : args) {

                    arguments.append(arg).append('\n');

                }

                System.out.println("you entered:\n" + arguments + "as the arguments");



                System.exit(0);

        }

    }



    @Override

    void init() {



        type = BaseTextExtraction.ExtractTypes.STRUCTURED_TEXT;



        super.init();



    }



    /**

     * Convenience method to write any Structured text in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param password user or owner password for pdf file

     * @param outputDir directory for writing out images

     * @throws org.jpedal.exception.PdfException a PDF exception

     */

    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir) throws PdfException {

        writeAllStructuredTextOutlinesToDir(inputDir, password, outputDir, null, null);

    }



    /**

     * Convenience method to write any Structured text in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param password user or owner password for pdf file

     * @param outputDir directory for writing out images

     * @param errorTracker a custom error tracker

     * @throws org.jpedal.exception.PdfException a PDF exception

     */

    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir, final ErrorTracker errorTracker) throws PdfException {

        writeAllStructuredTextOutlinesToDir(inputDir, password, outputDir, errorTracker, null);

    }



    /**

     * Convenience method to write any Structured text in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param password user or owner password for pdf file

     * @param outputDir directory for writing out images

     * @param errorTracker a custom error tracker

     * @param properties a ExtractStructuredTextProperties object for configuration

     * @throws org.jpedal.exception.PdfException a PDF exception

     */

    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir, final ErrorTracker errorTracker, final ExtractStructuredTextProperties properties) throws PdfException {



        final ExtractStructuredText extract;



        if (properties == null) {

            extract = new ExtractStructuredText(inputDir);

        } else {

            extract = new ExtractStructuredText(inputDir, properties);

        }

        if (password != null) {

            extract.setPassword(password);

        }



        if (errorTracker != null) {

            extract.decode_pdf.addExternalHandler(extract, Options.ErrorTracker);

        }



        extract.setup(outputDir);



        extract.processFiles(inputDir);



        extract.closePDFfile();

    }



    /**

     * Convenience method to write any Structured text in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param outputDir directory for writing out images

     * @throws org.jpedal.exception.PdfException a PDF exception

     */

    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String outputDir) throws PdfException {

        writeAllStructuredTextOutlinesToDir(inputDir, null, outputDir, null, null);

    }



    private void setup(String outputDir) {



        //check output dir has separator

        if (!outputDir.endsWith(separator)) {

            outputDir += separator;

        }



        output_dir = outputDir;

    }



    /**

     * gets the Document containing any Structured text (if present) as a Document structure

     * <br>

     * If the Document does not contain the meta data for Structured Content, an empty Document is returned

     *

     * @return Document

     */

    public Document getStructuredTextContent() {

        return decode_pdf.getMarkedContent();

    }

}