// Java PDF example code - ExtractTextInRectangle.java


/*

 * Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)

 */

package org.jpedal.examples.text;



import org.jpedal.exception.PdfException;

import org.jpedal.external.ErrorTracker;

import org.jpedal.external.Options;

import org.jpedal.utils.LogWriter;



import java.awt.Rectangle;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.OutputStreamWriter;



/**

 * <h2>Extract text from PDF files</h2>

 * <br>

 * This class provides a simple Java API to extract text from a PDF file

 * and also a static convenience method if you just want to dump all the text

 * from a PDF file or directory containing PDF files<br>

 * <br><a href="https://support.idrsolutions.com/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction.</a><br>

 */

public class ExtractTextInRectangle extends BaseTextExtraction {



    /**

     * The available formats that text can be output as

     */

    public enum OUTPUT_FORMAT {

        XML, TXT

    }



    /**
     * The format to output the text as. Starts as TXT and is replaced by
     * {@link #setOutputFormat(OUTPUT_FORMAT)}; when pages are written to disk it
     * selects the file suffix and whether the XML metadata wrapper is emitted.
     */
    private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TXT;

    /**
     * Whether to attempt to detect paragraphs and insert newlines into the output text.
     * Defaults to {@code false}; the value is handed to the grouping code by the
     * five-argument {@code getTextOnPage} and is recorded in the XML metadata.
     */
    private boolean estimateParagraphs;



    /**

     * Sets up an ExtractTextInRectangle instance to open a PDF File

     *

     * @param fileName full path to a single PDF file

     */

    public ExtractTextInRectangle(final String fileName) {

        super(fileName);



        init();

    }



    /**

     * Sets up an ExtractTextInRectangle instance to open a PDF File

     *

     * @param fileName         full path to a single PDF file

     * @param extractPlainText flag to extract plain text rather than XML

     */

    public ExtractTextInRectangle(final String fileName, final boolean extractPlainText) {

        super(fileName, extractPlainText);



        init();

    }



    /**

     * Sets up an ExtractTextInRectangle instance to open  a PDF file contained as a BLOB within a byte[] stream

     *

     * @param byteArray pdf file data

     */

    public ExtractTextInRectangle(final byte[] byteArray) {

        super(byteArray);



        init();

    }



    public void setOutputFormat(final OUTPUT_FORMAT format) {

        switch (format) {

            case XML:

                decode_pdf.useXMLExtraction();

                break;

            case TXT:

            default:

                decode_pdf.useTextExtraction();

        }

        outputFormat = format;

    }



    public void setEstimateParagraphs(final boolean estimateParagraphs) {

        this.estimateParagraphs = estimateParagraphs;

    }



    /**

     * routine to decode a file

     */

    @Override

    void decodeFile(final String file_name) throws PdfException {



        fileName = file_name;

        if (openPDFFile()) {



            String name = "demo"; //set a default just in case



            final int pointer = file_name.lastIndexOf(separator);



            if (pointer != -1) {

                name = file_name.substring(pointer + 1, file_name.length() - 4);

            }



            final String outputDir = output_dir + name + separator;



            //page range

            final int start = 1;

            int end = getPageCount();



            //limit to 1st ten pages in testing

            if (end > 10 && maxCount > 0 && end > maxCount) {

                end = maxCount;

            }



            try {

                for (int page = start; page < end + 1; page++) { //read pages

                    decodePage(page, outputDir);

                }

            } catch (final Exception e) {

                throw new PdfException(e.getMessage(), e);

            }

        }

    }



    private void decodePage(final int page, final String outputDir) throws PdfException, IOException {

        selectPage(page);



        /*Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */

        final int x1 = currentPageData.getMediaBoxX(page);

        final int x2 = currentPageData.getMediaBoxWidth(page) + x1;

        final int y2 = currentPageData.getMediaBoxY(page);

        final int y1 = currentPageData.getMediaBoxHeight(page) + y2;



                    /*

                            text extracted by call

                     */

        final String text = getTextOnPage(page, x1, y1, x2, y2);



        if (text != null) {



            //ensure a directory for data

            final File page_path = new File(outputDir + separator);

            if (!page_path.exists() && !page_path.mkdirs()) {

                throw new IOException("Unable to create output directory - " + page_path.getAbsolutePath());

            }



            final String encoding = System.getProperty("file.encoding");

            final String prefix;

            switch (outputFormat) {

                case XML:

                    prefix = ".xml";

                    break;

                case TXT:

                default:

                    prefix = ".txt";

            }



            try {

                try (OutputStreamWriter output_stream = new OutputStreamWriter(

                        new FileOutputStream(outputDir + page + prefix),

                        encoding

                )) {

                    if (outputFormat == OUTPUT_FORMAT.XML) {

                        output_stream.write("<?xml version=\"1.1\" encoding=\"UTF-8\"?>\n");

                        output_stream.write("<meta>\n");

                        output_stream.write(

                                "    <PAGELOCATION x1=\""

                                        + x1

                                        + "\" "

                                        + "y1=\""

                                        + y1

                                        + "\" "

                                        + "x2=\""

                                        + x2

                                        + "\" "

                                        + "y2=\""

                                        + y2

                                        + "\" />\n");

                        output_stream.write("    <ESTIMATEPARAGRAPHS value=\"" + estimateParagraphs + "\"/>\n");

                        output_stream.write("    <FILE value=\"" + decode_pdf.getFileName() + "\"/>\n");

                        output_stream.write("</meta>\n");

                        output_stream.write("<TEXT>\n");

                        //NOTE DATA IS TECHNICALLY UNICODE

                        output_stream.write(text); //write actual data

                        output_stream.write("\n</TEXT>\n");

                    } else {

                        output_stream.write(text); //write actual data

                    }

                }

            } catch (final IOException e) {

                LogWriter.writeLog(e);

            }

        }



        //remove data once written out

        decode_pdf.flushObjectValues(false);

    }



    /**

     * extract all text on page as a string value.

     * <p>

     * If the page contains text with multiple orientations (Left to right,

     * bottom to top), only the most common orientation will be extracted and

     * others will be ignored

     *

     * @param page number (first page is 1)

     * @return String with text

     * @throws PdfException if problem with parsing and extraxting text from PDF file

     */

    public String getTextOnPage(final int page) throws PdfException {



        checkFileOpened();



        selectPage(page);



        /*Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */

        final int x1 = currentPageData.getMediaBoxX(page);

        final int x2 = currentPageData.getMediaBoxWidth(page) + x1;

        final int y2 = currentPageData.getMediaBoxY(page);

        final int y1 = currentPageData.getMediaBoxHeight(page) + y2;



        return currentGrouping.extractTextInRectangle(x1, y1, x2, y2, page, false, true);

    }





    /**

     * extract all text on page in a specified region as a string value. If the

     * page contains text with multiple orientations (Left to right, bottom to

     * top), only the most common orientation will be extracted and others will

     * be ignored

     *

     * @param page (first page is 1)

     * @param rectangle   - top left corner x

     * @return String with text

     * @throws PdfException if problem with parsing and extraxting text from PDF file

     */

    public String getTextOnPage(final int page, final Rectangle rectangle) throws PdfException {





        checkFileOpened();



        selectPage(page);



        return currentGrouping.extractTextInRectangle(

                rectangle.x,

                rectangle.y,

                rectangle.x + rectangle.width,

                rectangle.y + rectangle.height,

                page,

                false,

                true);



    }



    /**

     * extract all text on page in a specified region as a string value.If the

     * page contains text with multiple orientations (Left to right, bottom to

     * top), only the most common orientation will be extracted and others will

     * be ignored

     *

     * @param page (first page is 1)

     * @param x1   - top left corner x

     * @param y1   - top left corner y

     * @param x2   - bottom right corner x

     * @param y2   - bottom right corner y

     * @return String with text

     * @throws PdfException if problem with parsing and extracting text from PDF file

     */

    /*Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */

    public String getTextOnPage(final int page, final int x1, final int y1, final int x2, final int y2) throws PdfException {



        checkFileOpened();



        selectPage(page);



        return currentGrouping.extractTextInRectangle(x1, y1, x2, y2, page, estimateParagraphs, true);

    }





    /**

     * This class will allow you to extract all text from page via command line from a single PDF file or a directory of PDF files.

     * <br>

     * The example expects two:

     * <ul>

     * <li>Value 1 is the file name or directory of PDF files to process</li>

     * <li>Value 2 is directory to write out the data</li>

     * </ul>

     *

     * @param args The expected arguments are described above.

     */

    @SuppressWarnings("unused")

    public static void main(final String[] args) {

        final int len = args.length;

        switch (len) {

            case 0:

                System.out.println("Example takes 2 parameters");

                System.out.println("Value 1 is the file name or directory of PDF files to process");

                System.out.println("Value 2 is Directory for writing the data as text files");

                System.exit(0);

            case 2:

                try {

                    writeAllTextToDir(args[0], args[1], -1);

                } catch (final PdfException e) {

                    LogWriter.writeLog(e);

                }

                break;

            default:

                System.out.println("too many arguments entered - run with no values to see defaults");



                final StringBuilder arguments = new StringBuilder();

                for (final String arg : args) {

                    arguments.append(arg).append('\n');

                }

                System.out.println("you entered:\n" + arguments + "as the arguments");



                System.exit(0);

        }

    }



    /**
     * Shared set-up run by every constructor: tags this extractor as
     * TEXT_IN_RECTANGLE, runs the base-class initialisation and then selects
     * plain-text (TXT) output as the default format.
     */
    @Override
    void init() {

        // assigned before super.init() - presumably the base initialisation reads
        // it; confirm before reordering
        type = BaseTextExtraction.ExtractTypes.TEXT_IN_RECTANGLE;

        super.init();

        // default format; also switches the decoder to text extraction
        setOutputFormat(OUTPUT_FORMAT.TXT);
    }



    /**

     * Convenience method to write all the text in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param password user or owner password for PDF files

     * @param outputDir directory for writing out wordlists

     * @param maxPages limit to just the first maxPages of a document

     * @param format set the output format for the text content (TXT or XML)

     * @param estimateParagraphs set if JPedal should estimate paragraph spacing in output.

     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file

     */

    public static void writeAllTextToDir(final String inputDir, final String password, final String outputDir,

            final int maxPages, final OUTPUT_FORMAT format, final boolean estimateParagraphs) throws PdfException {

        writeAllTextToDir(inputDir, password, outputDir, maxPages, format, estimateParagraphs, null);

    }



    /**

     * Convenience method to write all the text in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param password user or owner password for PDF files

     * @param outputDir directory for writing out wordlists

     * @param maxPages limit to just the first maxPages of a document

     * @param format set the output format for the text content (TXT or XML)

     * @param estimateParagraphs set if JPedal should estimate paragraph spacing in output.

     * @param errorTracker a custom error tracker

     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file

     */

    public static void writeAllTextToDir(final String inputDir, final String password, final String outputDir,

            final int maxPages, final OUTPUT_FORMAT format, final boolean estimateParagraphs, final ErrorTracker errorTracker) throws PdfException {



        final ExtractTextInRectangle extract = new ExtractTextInRectangle(inputDir);



        if (password != null) {

            extract.setPassword(password);

        }



        if (errorTracker != null) {

            extract.decode_pdf.addExternalHandler(extract, Options.ErrorTracker);

        }



        extract.setOutputFormat(format);



        extract.estimateParagraphs = estimateParagraphs;



        extract.setup(outputDir, maxPages);



        extract.processFiles(inputDir);



        extract.closePDFfile();



    }



    /**

     * Convenience method to write all the text in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param password user or owner password for PDF files

     * @param outputDir directory for writing out wordlists

     * @param maxPages limit to just the first maxPages of a document

     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file

     */

    public static void writeAllTextToDir(final String inputDir, final String password, final String outputDir, final int maxPages) throws PdfException {

        writeAllTextToDir(inputDir, password, outputDir, maxPages, OUTPUT_FORMAT.TXT, false, null);

    }



    /**

     * Convenience method to write all the text in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param outputDir directory for writing out wordlists

     * @param maxPages limit to just the first maxPages of a document

     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file

     */

    public static void writeAllTextToDir(final String inputDir, final String outputDir, final int maxPages) throws PdfException {

        writeAllTextToDir(inputDir, null, outputDir, maxPages);

    }



    private void setup(String outputDir, final int maxCount) {



        //check output dir has separator

        if (!outputDir.endsWith(separator)) {

            outputDir += separator;

        }



        output_dir = outputDir;

        this.maxCount = maxCount;

    }





}