// Java PDF example code - ExtractTextInRectangle.java


/*

 * Copyright (c) 1997-2026 IDRsolutions (https://www.idrsolutions.com)

 */

package org.jpedal.examples.text;



import org.jpedal.PdfDecoderServer;
import org.jpedal.exception.PdfException;
import org.jpedal.external.ErrorTracker;
import org.jpedal.external.Options;
import org.jpedal.fonts.FontMappings;
import org.jpedal.utils.LogWriter;

import java.awt.Rectangle;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Objects;



/**

 * <h2>Extract text from PDF files</h2>

 * <p>
 * This class provides a simple Java API to extract text from a PDF file,
 * either from a whole page or from a rectangle on a page, and also static
 * convenience methods if you just want to dump all the text from a PDF file
 * or a directory containing PDF files.
 * <p>

 * <a href="https://www.idrsolutions.com/docs/jpedal/tutorials/extract-text/extract-unstructured-text-with-a-rectangle-from-pdf-files">See our Support Pages for more information on Text Extraction.</a>

 */

public class ExtractTextInRectangle extends BaseTextExtraction {



    /**

     * The available formats that text can be output as

     */

    public enum OUTPUT_FORMAT {

        XML, TXT

    }



    /**
     * The format to output the text as. Defaults to {@link OUTPUT_FORMAT#TXT}
     * and is changed through {@code setOutputFormat}.
     */
    private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TXT;

    /**
     * Whether to attempt to detect paragraphs and insert newlines into the output text.
     * <p>
     * Only the five-argument {@code getTextOnPage(page, x1, y1, x2, y2)} overload
     * forwards this flag to the extractor; the other overloads pass {@code false}.
     */
    private boolean estimateParagraphs;

    /**
     * When {@code true}, extract text content as plain text instead of XML.
     * <p>
     * Read by {@code init()}, so it has to be assigned before {@code init()} is called.
     */
    private final boolean extractPlainText;



    /**
     * Sets up an ExtractTextInRectangle instance to open a PDF file.
     * <p>
     * Delegates to the {@code String} constructor using the file's absolute path,
     * which in turn leaves plain-text extraction switched off.
     *
     * @param file a single PDF file; must not be {@code null}
     */
    public ExtractTextInRectangle(final File file) {
        this(file.getAbsolutePath());
    }



    /**
     * Sets up an ExtractTextInRectangle instance to open a PDF file.
     * <p>
     * Delegates to the {@code (String, boolean)} constructor using the file's absolute path.
     *
     * @param file             a single PDF file; must not be {@code null}
     * @param extractPlainText flag to extract plain text rather than XML
     */
    public ExtractTextInRectangle(final File file, final boolean extractPlainText) {
        this(file.getAbsolutePath(), extractPlainText);
    }



    /**
     * Sets up an ExtractTextInRectangle instance to open a PDF file.
     * <p>
     * Equivalent to calling the {@code (String, boolean)} constructor with
     * {@code extractPlainText} set to {@code false}.
     *
     * @param fileName full path to a single PDF file
     */
    public ExtractTextInRectangle(final String fileName) {
        this(fileName, false);
    }



    /**
     * Sets up an ExtractTextInRectangle instance to open a PDF file and
     * creates the decoder via {@code init()}.
     *
     * @param fileName         full path to a single PDF file
     * @param extractPlainText flag to extract plain text rather than XML
     */
    public ExtractTextInRectangle(final String fileName, final boolean extractPlainText) {
        super(fileName);

        // must be assigned before init(), which reads it to pick the extraction mode
        this.extractPlainText = extractPlainText;

        init();
    }



    /**
     * Sets up an ExtractTextInRectangle instance to open a PDF file contained as a BLOB within a byte[] stream.
     * <p>
     * Equivalent to calling the {@code (byte[], boolean)} constructor with
     * {@code extractPlainText} set to {@code false}.
     *
     * @param byteArray pdf file data
     */
    public ExtractTextInRectangle(final byte[] byteArray) {
        this(byteArray, false);
    }



    /**
     * Sets up an ExtractTextInRectangle instance to open a PDF file contained as a BLOB within a byte[] stream
     * and creates the decoder via {@code init()}.
     *
     * @param byteArray        pdf file data
     * @param extractPlainText flag to extract plain text rather than XML
     */
    public ExtractTextInRectangle(final byte[] byteArray, final boolean extractPlainText) {
        super(byteArray);

        // must be assigned before init(), which reads it to pick the extraction mode
        this.extractPlainText = extractPlainText;

        init();
    }



    /**

     * Sets which output format to use, XML or TXT

     * @param format the output format to use

     */

    public void setOutputFormat(final OUTPUT_FORMAT format) {

        if (Objects.requireNonNull(format) == OUTPUT_FORMAT.XML) {

            decode_pdf.useXMLExtraction();

        } else {

            decode_pdf.useTextExtraction();

        }

        outputFormat = format;

    }



    /**
     * Sets whether paragraph estimation is requested when text is extracted.
     * <p>
     * The stored flag is passed to the extractor by the five-argument
     * {@code getTextOnPage(page, x1, y1, x2, y2)} overload.
     *
     * @param estimateParagraphs {@code true} to attempt to detect paragraphs
     *                           and insert newlines into the output text
     */
    public void setEstimateParagraphs(final boolean estimateParagraphs) {
        this.estimateParagraphs = estimateParagraphs;
    }



    /**

     * routine to decode a file

     */

    @Override

    public void decodeFile(final String file_name) throws PdfException {



        fileName = file_name;

        if (!openPDFFile()) {

            return;

        }



        String name = "demo"; //set a default just in case



        final int pointer = file_name.lastIndexOf(separator);



        if (pointer != -1) {

            name = file_name.substring(pointer + 1, file_name.length() - 4);

        }



        final String outputDir = this.outputDir + name + separator;



        //page range

        final int start = 1;

        int end = getPageCount();



        //limit to 1st ten pages in testing

        if (end > 10 && maxCount > 0 && end > maxCount) {

            end = maxCount;

        }



        try {

            for (int page = start; page < end + 1; page++) { //read pages

                decodePage(page, outputDir);

            }

        } catch (final Exception e) {

            throw new PdfException(e.getMessage(), e);

        }

    }



    private void decodePage(final int page, final String outputDir) throws PdfException, IOException {

        selectPage(page);



        // Co-ordinates are x1,y1 (lower left hand corner), x2,y2 (upper right)

        final int x1 = currentPageData.getMediaBoxX(page);

        final int x2 = currentPageData.getMediaBoxWidth(page) + x1;

        final int y2 = currentPageData.getMediaBoxY(page);

        final int y1 = currentPageData.getMediaBoxHeight(page) + y2;



        // text extracted by call

        final String text = getTextOnPage(page, x1, y1, x2, y2);



        if (text == null) {

            decode_pdf.flushObjectValues(false);

            return;

        }



        // ensure a directory for data

        final File page_path = new File(outputDir + separator);

        if (!page_path.exists() && !page_path.mkdirs()) {

            throw new IOException("Unable to create output directory - " + page_path.getAbsolutePath());

        }



        final String encoding = Charset.defaultCharset().displayName();

        final String prefix;

        if (Objects.requireNonNull(outputFormat) == OUTPUT_FORMAT.XML) {

            prefix = ".xml";

        } else {

            prefix = ".txt";

        }



        try {

            try (OutputStreamWriter output_stream = new OutputStreamWriter(

                    new FileOutputStream(outputDir + page + prefix),

                    encoding

            )) {

                if (outputFormat == OUTPUT_FORMAT.XML) {

                    output_stream.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");

                    output_stream.write("<meta>\n");

                    output_stream.write(

                            "    <PAGELOCATION x1=\""

                                    + x1

                                    + "\" "

                                    + "y1=\""

                                    + y1

                                    + "\" "

                                    + "x2=\""

                                    + x2

                                    + "\" "

                                    + "y2=\""

                                    + y2

                                    + "\" />\n");

                    output_stream.write("    <ESTIMATEPARAGRAPHS value=\"" + estimateParagraphs + "\"/>\n");

                    output_stream.write("    <FILE value=\"" + decode_pdf.getFileName() + "\"/>\n");

                    output_stream.write("</meta>\n");

                    output_stream.write("<TEXT>\n");

                    //NOTE DATA IS TECHNICALLY UNICODE

                    output_stream.write(text); //write actual data

                    output_stream.write("\n</TEXT>\n");

                } else {

                    output_stream.write(text); //write actual data

                }

            }

        } catch (final IOException e) {

            LogWriter.error(e, "Exception thrown while extracting text on page " + page + " from " + fileName);

        }



        //remove data once written out

        decode_pdf.flushObjectValues(false);

    }



    /**

     * extract all text on page as a string value.

     * <p>

     * If the page contains text with multiple orientations (Left to right,

     * bottom to top), only the most common orientation will be extracted and

     * others will be ignored

     *

     * @param page number (first page is 1)

     * @return String with text

     * @throws PdfException if problem with parsing and extraxting text from PDF file

     */

    public String getTextOnPage(final int page) throws PdfException {



        checkFileOpened();



        selectPage(page);



        /*Co-ordinates are x1,y1 (lower left hand corner), x2,y2(upper right) */

        final int x1 = currentPageData.getMediaBoxX(page);

        final int x2 = currentPageData.getMediaBoxWidth(page) + x1;

        final int y2 = currentPageData.getMediaBoxY(page);

        final int y1 = currentPageData.getMediaBoxHeight(page) + y2;



        return currentGrouping.extractTextInRectangle(x1, y1, x2, y2, page, false, true);

    }



    /**

     * extract all text on page in a specified region as a string value. If the

     * page contains text with multiple orientations (Left to right, bottom to

     * top), only the most common orientation will be extracted and others will

     * be ignored

     *

     * @param page (first page is 1)

     * @param rectangle   - lower left corner, width, height

     * @return String with text

     * @throws PdfException if problem with parsing and extraxting text from PDF file

     */

    public String getTextOnPage(final int page, final Rectangle rectangle) throws PdfException {



        checkFileOpened();



        selectPage(page);



        return currentGrouping.extractTextInRectangle(

                rectangle.x,

                rectangle.y,

                rectangle.x + rectangle.width,

                rectangle.y + rectangle.height,

                page,

                false,

                true);

    }



    /**

     * extract all text on page in a specified region as a string value.If the

     * page contains text with multiple orientations (Left to right, bottom to

     * top), only the most common orientation will be extracted and others will

     * be ignored

     *

     * @param page (first page is 1)

     * @param x1   - lower left corner x

     * @param y1   - lower left corner y

     * @param x2   - upper right corner x

     * @param y2   - upper right corner y

     * @return String with text

     * @throws PdfException if problem with parsing and extracting text from PDF file

     */

    public String getTextOnPage(final int page, final int x1, final int y1, final int x2, final int y2) throws PdfException {



        checkFileOpened();



        selectPage(page);



        return currentGrouping.extractTextInRectangle(x1, y1, x2, y2, page, estimateParagraphs, true);

    }



    /**

     * This class will allow you to extract all text from page via command line from a single PDF file or a directory of PDF files.

     * <br>

     * The example expects two:

     * <ul>

     * <li>Value 1 is the file name or directory of PDF files to process</li>

     * <li>Value 2 is directory to write out the data</li>

     * </ul>

     *

     * @param args The expected arguments are described above.

     */

    @SuppressWarnings("unused")

    public static void main(final String[] args) {

        final int len = args.length;

        switch (len) {

            case 0 -> {

                System.out.println("Example takes 2 parameters");

                System.out.println("Value 1 is the file name or directory of PDF files to process");

                System.out.println("Value 2 is Directory for writing the data as text files");

            }

            case 2 -> {

                try {

                    writeAllTextToDir(args[0], args[1], -1);

                } catch (final PdfException e) {

                    LogWriter.error(e, "Exception thrown while extracting text");

                }

            }

            default -> {

                System.out.println("too many arguments entered - run with no values to see defaults");



                final StringBuilder arguments = new StringBuilder();

                for (final String arg : args) {

                    arguments.append(arg).append('\n');

                }

                System.out.println("you entered:\n" + arguments + "as the arguments");

            }

        }

    }



    /**
     * Creates the decoder and configures it for text-only extraction.
     * <p>
     * Called by the constructors of this class after {@code extractPlainText}
     * has been assigned; when that flag is set the decoder is additionally
     * switched to plain-text extraction.
     */
    @Override
    void init() {
        decode_pdf = new PdfDecoderServer(false);
        FontMappings.setFontReplacements();
        decode_pdf.setExtractionMode(PdfDecoderServer.TEXT); //extract just text
        PdfDecoderServer.init(true);
        //make sure widths in data CRITICAL if we want to split lines correctly!!
        // NOTE(review): the remark above presumably refers to the
        // PdfDecoderServer.init(true) call - confirm against the JPedal API

        //Extract plain text rather than xml
        if (extractPlainText) {
            decode_pdf.useTextExtraction();
        }
    }



    /**
     * Convenience method to write all the text in a directory of PDF files.
     * <p>
     * Delegates to the overload that also takes an error tracker, passing
     * {@code null} so that no custom tracker is used.
     *
     * @param inputDir  directory containing PDF files
     * @param password user or owner password for PDF files; may be {@code null}
     * @param outputDir directory for writing out wordlists
     * @param maxPages limit to just the first maxPages of a document
     * @param format set the output format for the text content (TXT or XML)
     * @param estimateParagraphs set if JPedal should estimate paragraph spacing in output.
     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
     */
    public static void writeAllTextToDir(final String inputDir, final String password, final String outputDir,
            final int maxPages, final OUTPUT_FORMAT format, final boolean estimateParagraphs) throws PdfException {
        writeAllTextToDir(inputDir, password, outputDir, maxPages, format, estimateParagraphs, null);
    }



    /**

     * Convenience method to write all the text in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param password user or owner password for PDF files

     * @param outputDir directory for writing out wordlists

     * @param maxPages limit to just the first maxPages of a document

     * @param format set the output format for the text content (TXT or XML)

     * @param estimateParagraphs set if JPedal should estimate paragraph spacing in output.

     * @param errorTracker a custom error tracker

     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file

     */

    public static void writeAllTextToDir(final String inputDir, final String password, final String outputDir,

            final int maxPages, final OUTPUT_FORMAT format, final boolean estimateParagraphs, final ErrorTracker errorTracker) throws PdfException {



        final ExtractTextInRectangle extract = new ExtractTextInRectangle(inputDir);



        if (password != null) {

            extract.setPassword(password);

        }



        if (errorTracker != null) {

            extract.decode_pdf.addExternalHandler(extract, Options.ErrorTracker);

        }



        extract.setOutputFormat(format);



        extract.estimateParagraphs = estimateParagraphs;



        extract.setup(outputDir, maxPages);



        extract.processFiles(inputDir);



        extract.closePDFfile();



    }



    /**
     * Convenience method to write all the text in a directory of PDF files.
     * <p>
     * Uses {@link OUTPUT_FORMAT#TXT} output with paragraph estimation switched
     * off and no custom error tracker.
     *
     * @param inputDir  directory containing PDF files
     * @param password user or owner password for PDF files; may be {@code null}
     * @param outputDir directory for writing out wordlists
     * @param maxPages limit to just the first maxPages of a document
     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
     */
    public static void writeAllTextToDir(final String inputDir, final String password, final String outputDir, final int maxPages) throws PdfException {
        writeAllTextToDir(inputDir, password, outputDir, maxPages, OUTPUT_FORMAT.TXT, false, null);
    }



    /**
     * Convenience method to write all the text in a directory of PDF files.
     * <p>
     * Delegates to the overload that takes a password, passing {@code null}
     * so that no password is set.
     *
     * @param inputDir  directory containing PDF files
     * @param outputDir directory for writing out wordlists
     * @param maxPages limit to just the first maxPages of a document
     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
     */
    public static void writeAllTextToDir(final String inputDir, final String outputDir, final int maxPages) throws PdfException {
        writeAllTextToDir(inputDir, null, outputDir, maxPages);
    }



    private void setup(String outputDir, final int maxCount) {

        //check output dir has separator

        if (!outputDir.endsWith(separator)) {

            outputDir += separator;

        }



        this.outputDir = outputDir;

        this.maxCount = maxCount;

    }



}