// Java PDF example code - ExtractStructuredText.java


/*

 * Copyright (c) 1997-2026 IDRsolutions (https://www.idrsolutions.com)

 */

package org.jpedal.examples.text;



import org.jpedal.PdfDecoderServer;

import org.jpedal.examples.text.configuration.ExtractStructuredTextProperties;

import org.jpedal.examples.text.configuration.OutputModes;

import org.jpedal.examples.text.output.EPUBStructuredTextWriter;

import org.jpedal.examples.text.output.HTMLStructuredTextWriter;

import org.jpedal.examples.text.output.JSONStructuredTextWriter;

import org.jpedal.examples.text.output.MarkdownStructuredTextWriter;

import org.jpedal.examples.text.output.StructuredTextWriter;

import org.jpedal.examples.text.output.XMLStructuredTextWriter;

import org.jpedal.examples.text.output.YAMLStructuredTextWriter;

import org.jpedal.exception.PdfException;

import org.jpedal.external.ErrorTracker;

import org.jpedal.external.Options;

import org.jpedal.utils.LogWriter;

import org.w3c.dom.Document;



import java.io.IOException;







/**

 * <h2>Extract Structured Content (if present) from PDF files</h2>

 * <p>

 * This class provides a simple Java API to extract Structured Content (if present) from a PDF file, plus static

 * convenience methods if you just want to dump any structured outlines from a PDF file or a directory containing

 * PDF files.

 * <p>

 * If no Structure is present a blank file is returned<br>

 * <p>

 * For non-structured files, consider:

 * <ul>

 *     <li><a href="http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextAsWordlist.java.html">ExtractTextAsWordList</a></li>

 *     <li><a href="http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextInRectangle.java.html">ExtractTextInRectangle</a></li>

 * </ul>

 * <a href="https://support.idrsolutions.com/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction</a>

 */

public class ExtractStructuredText extends BaseTextExtraction {



    // Extraction configuration; the two-argument constructors replace it when given a non-null instance.
    // decodeFile reads the file output mode from it.
    ExtractStructuredTextProperties properties = new ExtractStructuredTextProperties();



    // Directory for figure images. When null, decodeFile (in non-EPUB modes) extracts the structured text
    // without writing figures; in EPUB mode the value is passed straight to the EPUB writer.
    private String figuresFolder;

    // Image format handed to the figure-writing calls together with figuresFolder.
    private String figuresFormat;

    // Scaling handed to the figure-writing calls; 1.0f unless overridden by the static convenience methods.
    private float figuresScaling = 1.0f;



    /**
     * Sets up an ExtractStructuredText instance to open a PDF file, keeping the default
     * {@link ExtractStructuredTextProperties}.
     *
     * @param fileName full path to a single PDF file (the static convenience methods of this class also pass
     *                 a directory path here)
     */
    public ExtractStructuredText(final String fileName) {

        super(fileName);



        init();

    }



    /**
     * Sets up an ExtractStructuredText instance to open a PDF file held in memory as a byte array (BLOB),
     * keeping the default {@link ExtractStructuredTextProperties}.
     *
     * @param byteArray byte array containing the PDF data
     */
    public ExtractStructuredText(final byte[] byteArray) {

        super(byteArray);



        init();

    }



    /**
     * Sets up an ExtractStructuredText instance to open a PDF file with a caller-supplied configuration.
     *
     * @param fileName   full path to a single PDF file (the static convenience methods of this class also pass
     *                   a directory path here)
     * @param properties ExtractStructuredTextProperties object for configuring extraction; when {@code null}
     *                   the default properties created with the instance are kept
     */
    public ExtractStructuredText(final String fileName, final ExtractStructuredTextProperties properties) {

        super(fileName);



        // a null configuration leaves the field's default instance in place
        if (properties != null) {

            this.properties = properties;

        }



        init();

    }



    /**
     * Sets up an ExtractStructuredText instance to open a PDF file held in memory as a byte array (BLOB)
     * with a caller-supplied configuration.
     *
     * @param byteArray  byte array containing the PDF data
     * @param properties ExtractStructuredTextProperties object for configuring extraction; when {@code null}
     *                   the default properties created with the instance are kept
     */
    public ExtractStructuredText(final byte[] byteArray, final ExtractStructuredTextProperties properties) {

        super(byteArray);



        // a null configuration leaves the field's default instance in place
        if (properties != null) {

            this.properties = properties;

        }



        init();

    }



    /**

     * routine to decode a file

     */

    @Override

    public void decodeFile(final String file_name) throws PdfException {



        fileName = file_name;

        if (!openPDFFile()) {

            return;

        }



        final OutputModes mode = properties.getFileOutputMode();



        if (mode == OutputModes.EPUB) {

            try {

                EPUBStructuredTextWriter.write(file_name, outputDir, properties, separator, figuresFolder, figuresFormat, figuresScaling);

            } catch (final Exception e) {

                LogWriter.error(e, e.getMessage());

                throw new PdfException(e.getMessage());

            }

            return;

        }



        //read pages -if you already have code this is probably

        //all you need!

        final Document tree;

        if (figuresFolder != null) {

            try {

                tree = getStructuredTextContentAndFigures(figuresFolder, figuresFormat, figuresScaling);

            } catch (final IOException e) {

                LogWriter.error(e, e.getMessage());

                throw new PdfException(e.getMessage());

            }

        } else {

            tree = getStructuredTextContent();

        }



        if (tree == null) {

            return;

        }



        switch (mode) {

            case XML -> {

                final StructuredTextWriter XMLwriter = new XMLStructuredTextWriter(separator);

                XMLwriter.write(tree, file_name, outputDir);

            }

            case HTML -> {

                final StructuredTextWriter HTMLwriter = new HTMLStructuredTextWriter(separator);

                HTMLwriter.write(tree, file_name, outputDir);

            }

            case MARKDOWN -> {

                final StructuredTextWriter markdownWriter = new MarkdownStructuredTextWriter(separator);

                markdownWriter.write(tree, file_name, outputDir);

            }

            case JSON -> {

                final StructuredTextWriter JSONwriter = new JSONStructuredTextWriter(separator);

                JSONwriter.write(tree, file_name, outputDir);

            }

            case YAML -> {

                final StructuredTextWriter YAMLwriter = new YAMLStructuredTextWriter(separator);

                YAMLwriter.write(tree, file_name, outputDir);

            }

        }

    }



    /**

     * This class will allow you to extract any Structured Text data via command line from a single PDF file or a

     * directory of PDF files.

     * <br>

     * The example expects the following parameters:

     * <ul>

     * <li>Value 1 is the file name or directory of PDF files to process</li>

     * <li>Value 2 is the directory to write out the outline data</li>

     * <li>(Optional, unless Value 4 is present then Value 3 must be present) Value 3 is the outline data file format</li>

     * <li>Value 4 is the directory to write out the figures data</li>

     * <li>(Optional) Value 5 is the figures output format</li>

     * </ul>

     *

     * @param args The expected arguments are described above.

     */

    @SuppressWarnings("unused")

    public static void main(final String[] args) {

        switch (args.length) {

            case 0, 1 -> {

                System.out.println("Example takes 2 parameters");

                System.out.println("Value 1 is the file name or directory of PDF files to process");

                System.out.println("Value 2 is the Directory for writing the outline data as text files");

                System.out.println("(Optional) Value 3 is the outline output format, defaults to xml if not present");

                System.out.println("Value 4 is the Directory for writing the figures data as images");

                System.out.println("(Optional) Value 5 is the figures output format, defaults to jpeg if not present");

                System.out.println("(Optional) Value 6 is the figures image scaling, defaults to 1.0 if not present");

            }

            case 2 -> {

                try {

                    writeAllStructuredTextOutlinesToDir(args[0], args[1]);

                } catch (final PdfException e) {

                    LogWriter.error(e, "Exception thrown while extracting structured text");

                }

            }

            case 3 -> {

                try {

                    final String outputFormat = args[2];

                    final ExtractStructuredTextProperties properties = new ExtractStructuredTextProperties();

                    handleOutputFormat(outputFormat, properties);

                    writeAllStructuredTextOutlinesToDir(args[0], "", args[1], null, properties);

                } catch (final PdfException e) {

                    LogWriter.error(e, "Exception thrown while extracting structured text");

                }

            }

            case 4, 5, 6 -> {

                try {

                    final String outputFormat = args[2];

                    final ExtractStructuredTextProperties properties = new ExtractStructuredTextProperties();

                    handleOutputFormat(outputFormat, properties);



                    final String figuresFormat = args.length < 5 || args[4] == null ? "jpeg" : args[4];

                    final float figuresScaling = args.length < 6 || args[5] == null ? 1.0f : Float.parseFloat(args[5]);



                    writeAllStructuredTextOutlinesAndFiguresToDir(args[0], "", args[1], null, properties, args[3], figuresFormat, figuresScaling);

                } catch (final PdfException e) {

                    LogWriter.error(e, "Exception thrown while extracting structured text");

                }

            }

            default -> {

                System.out.println("too many arguments entered - run with no values to see defaults");



                final StringBuilder arguments = new StringBuilder();

                for (final String arg : args) {

                    arguments.append(arg).append('\n');

                }

                System.out.println("you entered:\n" + arguments + "as the arguments");

            }

        }

    }



    /**

     * Set the file output mode based on a string input

     *

     * @param outputFormat the output format as a string

     * @param properties the properties object which will have its output mode set

     *

     * @throws IllegalArgumentException if an invalid output mode is supplied

     */

    private static void handleOutputFormat(final String outputFormat, final ExtractStructuredTextProperties properties) throws IllegalArgumentException {

        switch (outputFormat.toLowerCase()) {

            case "html" -> properties.setFileOutputMode(OutputModes.HTML);

            case "xml" -> properties.setFileOutputMode(OutputModes.XML);

            case "json" -> properties.setFileOutputMode(OutputModes.JSON);

            case "markdown" -> properties.setFileOutputMode(OutputModes.MARKDOWN);

            case "epub" -> properties.setFileOutputMode(OutputModes.EPUB);

            case "yaml" -> properties.setFileOutputMode(OutputModes.YAML);

            default -> {

                final StringBuilder message = new StringBuilder();

                message.append("Output format of \"").append(outputFormat).append(" is not recognised.\n");

                message.append("Valid values are as follows,");

                final OutputModes[] modes = OutputModes.values();

                for (final OutputModes mode : modes) {

                    message.append(mode);

                }

                throw new IllegalArgumentException(message.toString());

            }

        }

    }



    /**
     * Creates the decoder used by this example.
     * <p>
     * Assigns a new {@link PdfDecoderServer} (constructed with {@code false}) to {@code decode_pdf} and then
     * calls the static {@code PdfDecoderServer.init(false)}. Every constructor of this class calls this method.
     */
    @Override

    void init() {

        decode_pdf = new PdfDecoderServer(false);

        PdfDecoderServer.init(false);

    }



    /**
     * Convenience method to write any Structured text found in a PDF file or a directory of PDF files,
     * using no custom error tracker and the default extraction properties.
     *
     * @param inputDir  PDF file or directory containing PDF files
     * @param password  user or owner password for pdf file (may be {@code null})
     * @param outputDir directory for writing out the structured text
     *
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir) throws PdfException {

        writeAllStructuredTextOutlinesToDir(inputDir, password, outputDir, null, null);

    }



    /**
     * Convenience method to write any Structured text found in a PDF file or a directory of PDF files,
     * using the default extraction properties.
     *
     * @param inputDir     PDF file or directory containing PDF files
     * @param password     user or owner password for pdf file (may be {@code null})
     * @param outputDir    directory for writing out the structured text
     * @param errorTracker a custom error tracker (may be {@code null})
     *
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir, final ErrorTracker errorTracker) throws PdfException {

        writeAllStructuredTextOutlinesToDir(inputDir, password, outputDir, errorTracker, null);

    }



    /**

     * Convenience method to write any Structured text in a directory of PDF files

     *

     * @param inputDir     directory containing PDF files

     * @param password     user or owner password for pdf file

     * @param outputDir    directory for writing out structured text

     * @param errorTracker a custom error tracker

     * @param properties   a ExtractStructuredTextProperties object for configuration

     *

     * @throws org.jpedal.exception.PdfException a PDF exception

     */

    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir, final ErrorTracker errorTracker, final ExtractStructuredTextProperties properties) throws PdfException {



        final ExtractStructuredText extract;



        if (properties == null) {

            extract = new ExtractStructuredText(inputDir);

        } else {

            extract = new ExtractStructuredText(inputDir, properties);

        }

        if (password != null) {

            extract.setPassword(password);

        }



        if (errorTracker != null) {

            extract.decode_pdf.addExternalHandler(extract, Options.ErrorTracker);

        }



        extract.setup(outputDir);



        extract.processFiles(inputDir);



        extract.closePDFfile();

    }



    /**
     * Convenience method to write any Structured text, together with its figures, found in a PDF file or a
     * directory of PDF files. Figures are written with a scaling of {@code 1.0f}.
     *
     * @param inputDir      PDF file or directory containing PDF files
     * @param password      user or owner password for pdf file
     * @param outputDir     directory for writing out structured text
     * @param errorTracker  a custom error tracker
     * @param properties    a ExtractStructuredTextProperties object for configuration
     * @param figuresDir    directory for writing out figures
     * @param figuresFormat image file format for writing figures
     *
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesAndFiguresToDir(final String inputDir, final String password,

            final String outputDir, final ErrorTracker errorTracker,

            final ExtractStructuredTextProperties properties,

            final String figuresDir, final String figuresFormat) throws PdfException {



        writeAllStructuredTextOutlinesAndFiguresToDir(inputDir, password, outputDir, errorTracker, properties, figuresDir, figuresFormat, 1.0f);

    }



    /**

     * Convenience method to write any Structured text in a directory of PDF files

     *

     * @param inputDir       directory containing PDF files

     * @param password       user or owner password for pdf file

     * @param outputDir      directory for writing out structured text

     * @param errorTracker   a custom error tracker

     * @param properties     a ExtractStructuredTextProperties object for configuration

     * @param figuresDir     directory for writing out figures

     * @param figuresFormat  image file format for writing figures

     * @param figuresScaling scaling value for writing figures images

     *

     * @throws org.jpedal.exception.PdfException a PDF exception

     */

    public static void writeAllStructuredTextOutlinesAndFiguresToDir(final String inputDir, final String password,

            final String outputDir, final ErrorTracker errorTracker,

            final ExtractStructuredTextProperties properties,

            final String figuresDir, final String figuresFormat, final float figuresScaling) throws PdfException {



        final ExtractStructuredText extract;



        if (properties == null) {

            extract = new ExtractStructuredText(inputDir);

        } else {

            extract = new ExtractStructuredText(inputDir, properties);

        }

        if (password != null) {

            extract.setPassword(password);

        }

        if (figuresDir != null) {

            extract.figuresFolder = figuresDir;

        }

        extract.figuresFormat = figuresFormat;

        extract.figuresScaling = figuresScaling;



        if (errorTracker != null) {

            extract.decode_pdf.addExternalHandler(extract, Options.ErrorTracker);

        }



        extract.setup(outputDir);



        extract.processFiles(inputDir);



        extract.closePDFfile();

    }



    /**
     * Convenience method to write any Structured text found in a PDF file or a directory of PDF files,
     * using no password, no custom error tracker and the default extraction properties.
     *
     * @param inputDir  PDF file or directory containing PDF files
     * @param outputDir directory for writing out the structured text
     *
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String outputDir) throws PdfException {

        writeAllStructuredTextOutlinesToDir(inputDir, null, outputDir, null, null);

    }



    /**

     * Set up the text extraction.

     * Configure output directory with correct trailing separator.

     *

     * @param outputDir the output directory

     */

    private void setup(String outputDir) {



        //check output dir has separator

        if (!outputDir.endsWith(separator)) {

            outputDir += separator;

        }



        this.outputDir = outputDir;

    }



    /**
     * Gets any Structured text (if present) as a single Document structure, by delegating to
     * {@code decode_pdf.getMarkedContent()}.
     * <br>
     * If the PDF does not contain the meta data for Structured Content, an empty Document is returned
     *
     * @return Document holding the structured content
     */
    public Document getStructuredTextContent() {

        return decode_pdf.getMarkedContent();

    }



    /**
     * Gets any Structured text (if present) per page, as an array of Documents, by delegating to
     * {@code decode_pdf.getMarkedContentPerPage()}.
     * <br>
     * If the PDF does not contain the meta data for Structured Content, an empty Document is returned
     *
     * @return array of Documents holding the structured content per page
     */
    public Document[] getStructuredTextContentPerPage() {

        return decode_pdf.getMarkedContentPerPage();

    }



    /**
     * Gets the marked content of the PDF as a Document and also writes out the figures to a supplied directory.
     * Delegates to {@code decode_pdf.getMarkedContent(figureDir, imageFormat)}.
     *
     * @param figureDir The directory to write the figure images
     * @param imageFormat The image format for the figure images
     *
     * @return The marked content document
     *
     * @throws IOException If there is a problem with writing the images
     */
    public Document getStructuredTextContentAndFigures(final String figureDir, final String imageFormat) throws IOException {

        return decode_pdf.getMarkedContent(figureDir, imageFormat);

    }



    /**
     * Gets the marked content of the PDF as a Document and also writes out the figures, at the given scaling,
     * to a supplied directory.
     * Delegates to {@code decode_pdf.getMarkedContent(figureDir, imageFormat, imageScaling)}.
     *
     * @param figureDir The directory to write the figure images
     * @param imageFormat The image format for the figure images
     * @param imageScaling The scaling for the figure images
     *
     * @return The marked content document
     *
     * @throws IOException If there is a problem with writing the images
     */
    public Document getStructuredTextContentAndFigures(final String figureDir, final String imageFormat, final float imageScaling) throws IOException {

        return decode_pdf.getMarkedContent(figureDir, imageFormat, imageScaling);

    }



    /**
     * Gets the marked content of the PDF per page and also writes out the figures to a supplied directory.
     * Delegates to {@code decode_pdf.getMarkedContentPerPage(figureDir, imageFormat)}.
     *
     * @param figureDir The directory to write the figure images
     * @param imageFormat The image format for the figure images
     *
     * @return The marked content as an array of Documents, one element per page
     *
     * @throws IOException If there is a problem with writing the images
     */
    public Document[] getStructuredTextContentAndFiguresPerPage(final String figureDir, final String imageFormat) throws IOException {

        return decode_pdf.getMarkedContentPerPage(figureDir, imageFormat);

    }



    /**
     * Gets the marked content of the PDF per page and also writes out the figures, at the given scaling,
     * to a supplied directory.
     * Delegates to {@code decode_pdf.getMarkedContentPerPage(figureDir, imageFormat, imageScaling)}.
     *
     * @param figureDir The directory to write the figure images
     * @param imageFormat The image format for the figure images
     * @param imageScaling The scaling for the figure images
     *
     * @return The marked content as an array of Documents, one element per page
     *
     * @throws IOException If there is a problem with writing the images
     */
    public Document[] getStructuredTextContentAndFiguresPerPage(final String figureDir, final String imageFormat, final float imageScaling) throws IOException {

        return decode_pdf.getMarkedContentPerPage(figureDir, imageFormat, imageScaling);

    }



}