Java PDF example code - ExtractOutline.java


/*

 * Copyright (c) 1997-2026 IDRsolutions (https://www.idrsolutions.com)

 */

package org.jpedal.examples.text;



import org.jpedal.PdfDecoderServer;

import org.jpedal.exception.PdfException;

import org.jpedal.utils.LogWriter;

import org.jpedal.utils.SecureTransformerFactory;

import org.w3c.dom.Document;



import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerException;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;

import javax.xml.transform.stream.StreamSource;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;



/**

 * <h2>Outline Object Data Extraction from PDF files</h2>

 * <p>

 * This class provides a simple Java API to extract data in the Outline Data object (if present) from a PDF file as a Document object

 * and also a static convenience method if you just want to dump all the outlines from a PDF file or directory containing PDF files

 * <p>

 * <a href="https://www.idrsolutions.com/docs/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction.</a>

 */

public class ExtractOutline extends BaseTextExtraction {



    /**

     * Sets up an ExtractOutline instance to open a PDF File

     *

     * @param file path to a single PDF file

     */

    public ExtractOutline(final File file) {

        this(file.getAbsolutePath());

    }



    /**

     * Sets up an ExtractOutline instance to open a PDF File

     *

     * @param fileName full path to a single PDF file

     */

    public ExtractOutline(final String fileName) {

        super(fileName);



        init();

    }



    /**

     * Sets up an ExtractOutline instance to open a PDF file contained as a BLOB within a byte[] stream

     *

     * @param byteArray pdf file data

     */

    public ExtractOutline(final byte[] byteArray) {

        super(byteArray);



        init();

    }



    /**

     * routine to decode a file

     */

    @Override

    public void decodeFile(final String file_name) throws PdfException {



        fileName = file_name;

        if (!openPDFFile()) {

            return;

        }



        final Document outline = getPDFTextOutline();



        final Transformer transformer;

        try (InputStream stylesheet = getClass().getResourceAsStream("/org/jpedal/examples/text/xmlstyle.xslt")) {



            final TransformerFactory tFactory = SecureTransformerFactory.newInstance();

            transformer = tFactory.newTransformer(new StreamSource(stylesheet));



            //get just the name of the file without the path to use as a sub-directory or .pdf

            String name = "demo"; //set a default just in case



            final int pointer = file_name.lastIndexOf(separator);



            if (pointer != -1) {

                name = file_name.substring(pointer + 1, file_name.length() - 4);

            }



            final String outputDir = this.outputDir + name + separator;





            final DOMSource source = new DOMSource(outline);

            if (source.getNode() != null) {

                final File output = new File(outputDir + "outline.xml");



                if (!output.exists()) {

                    final File createDir = new File(outputDir);

                    createDir.mkdirs();

                    output.createNewFile();

                }

                final FileOutputStream fos = new FileOutputStream(output);

                final StreamResult result = new StreamResult(fos);

                transformer.transform(source, result);

            }

        } catch (final IOException | TransformerException e) {

            throw new PdfException(e.getMessage(), e);

        }

    }



    /**

     * This class will allow you to extract any Outline data via command line from a single PDF file or a directory of PDF files.

     * <br>

     * The example expects two:

     * <ul>

     * <li>Value 1 is the file name or directory of PDF files to process</li>

     * <li>Value 2 is directory to write out the outline data</li>

     * </ul>

     *

     * @param args The expected arguments are described above.

     */

    @SuppressWarnings("unused")

    public static void main(final String[] args) {

        final int len = args.length;

        switch (len) {

            case 0 -> {

                System.out.println("Example takes 2 parameters");

                System.out.println("Value 1 is the file name or directory of PDF files to process");

                System.out.println("Value 2 is Directory for writing the data as text files");

                System.exit(0);

            }

            case 2 -> {

                try {

                    writeAllOutlinesToDir(args[0], args[1]);

                } catch (final PdfException e) {

                    LogWriter.error(e, "Exception thrown while extracting file Outline");

                }

            }

            default -> {

                System.out.println("too many arguments entered - run with no values to see defaults");



                final StringBuilder arguments = new StringBuilder();

                for (final String arg : args) {

                    arguments.append(arg).append('\n');

                }

                System.out.println("you entered:\n" + arguments + "as the arguments");



                System.exit(0);

            }

        }

    }



    @Override

    void init() {

        //PdfDecoder returns a PdfException if there is a problem

        decode_pdf = new PdfDecoderServer(false);

        decode_pdf.setExtractionMode(PdfDecoderServer.TEXT); //extract just text

        PdfDecoderServer.init(true);

        //make sure widths in data CRITICAL if we want to split lines correctly!!

    }



    /**

     * Convenience method to write all the Outlines in a directory of PDF files

     *

     * @param input  directory containing PDF files

     * @param outputDir directory for writing out data

     * @throws org.jpedal.exception.PdfException A Pdf Exception

     */

    public static void writeAllOutlinesToDir(final String input, final String outputDir) throws PdfException {

        final ExtractOutline extract = new ExtractOutline(input);



        extract.setup(outputDir);



        extract.processFiles(input);



        extract.closePDFfile();

    }





    /**

     * Convenience method to write all the Outlines in a directory of PDF files

     *

     * @param input  file or directory containing PDF files

     * @param password  password to be used to open files

     * @param outputDir directory for writing out data

     * @throws org.jpedal.exception.PdfException A Pdf Exception

     */

    public static void writeAllOutlinesToDir(final String input, final String password, final String outputDir) throws PdfException {

        final ExtractOutline extract = new ExtractOutline(input);



        if (password != null) {

            extract.setPassword(password);

        }



        extract.setup(outputDir);



        extract.processFiles(input);



        extract.closePDFfile();

    }



    private void setup(String outputDir) {



        //check output dir has separator

        if (!outputDir.endsWith(separator)) {

            outputDir += separator;

        }



        this.outputDir = outputDir;

    }



    /**

     * gets the Document Outline object (if present) as a Document structure

     *

     * @return Document

     */

    public Document getPDFTextOutline() {

        return decode_pdf.getOutlineAsXML();

    }

}