// Java PDF example code - ExtractTextAsWordlist.java


/*

 * Copyright (c) 1997-2025 IDRsolutions (https://www.idrsolutions.com)

 */

package org.jpedal.examples.text;



import org.jpedal.PdfDecoderServer;

import org.jpedal.exception.PdfException;

import org.jpedal.external.ErrorTracker;

import org.jpedal.external.Options;

import org.jpedal.fonts.FontMappings;

import org.jpedal.utils.LogWriter;

import org.jpedal.utils.Strip;



import java.awt.Rectangle;

import java.io.File;

import java.io.FileOutputStream;

import java.io.OutputStreamWriter;

import java.nio.charset.StandardCharsets;

import java.util.Iterator;

import java.util.List;



/**

 * <h2>Extract words and locations from PDF files</h2>

 * <p>

 * This class provides a simple Java API to extract the words on a page,
 * together with their locations, from a PDF file. It also offers static
 * convenience methods if you just want to dump all the word lists from a
 * PDF file or a directory containing PDF files.

 * <p>

 * <a href="https://support.idrsolutions.com/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction.</a>

 */

public class ExtractTextAsWordlist extends BaseTextExtraction {



    /**
     * Running total of words extracted - used for testing.
     * It is increased for every page processed by decodeFile (one word per
     * group of 5 list entries) and returned by the static
     * writeAllWordlistsToDir convenience methods.
     */
    private int wordsExtracted;

    /**
     * Default delimiters used to discern the boundary of words in page content.
     * Used by decodeFile and by the getWordsOnPage overload that takes no
     * explicit delimiters.
     */
    public static final String DEFAULT_DELIMITERS = "&:=()!;.,\\/\"\"''";



    /**
     * Sets up an ExtractTextAsWordlist instance to open a PDF File.
     * The value is handed to the superclass constructor and the decoder is
     * then configured by {@code init()}.
     *
     * @param fileName full path to a single PDF file (the static
     *                 writeAllWordlistsToDir methods also pass their input
     *                 file-or-directory value here)
     */
    public ExtractTextAsWordlist(final String fileName) {
        super(fileName);

        //create and configure the decoder for text extraction
        init();
    }



    /**
     * Sets up an ExtractTextAsWordlist instance to open a PDF file contained as a BLOB within a byte[] stream.
     * The data is handed to the superclass constructor and the decoder is
     * then configured by {@code init()}.
     *
     * @param byteArray pdf file data
     */
    public ExtractTextAsWordlist(final byte[] byteArray) {
        super(byteArray);

        //create and configure the decoder for text extraction
        init();
    }



    /**

     * routine to decode a file

     */

    @Override

    public void decodeFile(final String file_name) throws PdfException {



        fileName = file_name;

        if (!openPDFFile()) {

            return;

        }



        /*get just the name of the file without

         * the path to use as a sub-directory or .pdf

         */

        String name = "demo"; //set a default just in case



        final int pointer = file_name.lastIndexOf(separator);



        if (pointer != -1) {

            name = file_name.substring(pointer + 1, file_name.length() - 4);

        }



        /*

         * create output dir for text

         */

        final String outputDir = this.outputDir + separator + name + separator;



        //page range

        final int start = 1;

        int end = getPageCount();



        //limit to 1st ten pages in testing

        if (end > 10 && maxCount > 0 && end > maxCount) {

            end = maxCount;

        }



        try {

            for (int page = start; page < end + 1; page++) { //read pages



                selectPage(page);



                final List<String> words = getWordsOnPage(page, DEFAULT_DELIMITERS);



                if (words == null) {

                    continue;

                }



                //create a directory if it doesn't exist

                final File output_path = new File(outputDir);

                if (!output_path.exists()) {

                    output_path.mkdirs();

                }



                /*each word is stored as 5 consecutive values (word,x1,y1,x2,y2)*/

                final int wordCount = words.size() / 5;



                //update our count

                wordsExtracted += wordCount;





                try (OutputStreamWriter output_stream = new OutputStreamWriter(

                        new FileOutputStream(outputDir + "words-" + page + ".txt"),

                        StandardCharsets.UTF_8

                )) {



                    final Iterator<String> wordIterator = words.iterator();

                    while (wordIterator.hasNext()) {



                        String currentWord = wordIterator.next();



                        /*remove the XML formatting if present - not needed for pure text*/

                        currentWord = Strip.convertToText(currentWord, decode_pdf.isXMLExtraction());



                        /*

                         * these co-ordinates are absolute from the bottom of the page (MediaBox)

                         * If you are extracting image (which may use crop, use need to modify as below

                         */

                        final int wx1 = (int) Float.parseFloat(wordIterator.next());

                        final int wy1 = (int) Float.parseFloat(wordIterator.next());

                        final int wx2 = (int) Float.parseFloat(wordIterator.next());

                        final int wy2 = (int) Float.parseFloat(wordIterator.next());



                        /*this could be inserting into a database instead*/

                        output_stream.write(currentWord + ',' + wx1 + ',' + wy1 + ',' + wx2 + ',' + wy2 + '\n');

                    }

                }

            }



            //remove data once written out

            decode_pdf.flushObjectValues(false);

        } catch (final Exception e) {

            LogWriter.error(e, "Exception thrown while extracting wordlist from " + file_name);

            throw new PdfException(e.getMessage(), e);

        }

    }



    /**

     * Gets the individual words from the pages text content and returns them.

     * Uses a default set of delimiters to determine word bounds.

     *

     * @param page The page to get text content from.

     * @return List object containing all words found on the page.

     * @throws PdfException if problem with parsing and extraxting text from PDF file

     */

    public List<String> getWordsOnPage(final int page) throws PdfException {



        checkFileOpened();



        selectPage(page);



        final int x1 = currentPageData.getMediaBoxX(page);

        final int x2 = currentPageData.getMediaBoxWidth(page) + x1;



        final int y2 = currentPageData.getMediaBoxX(page);

        final int y1 = currentPageData.getMediaBoxHeight(page) - y2;



        return getWordsOnPage(page, x1, y1, x2, y2, DEFAULT_DELIMITERS);





    }



    /**

     * Gets the individual words from the pages text content and returns them.

     * Uses the provided delimiters to determine word bounds.

     *

     * @param page       The page to get text content from.

     * @param delimiters A String of characters to be used as delimiters for words.

     * @return List object containing all words found on the page.

     * @throws PdfException if problem with parsing and extraxting text from PDF file

     */

    public List<String> getWordsOnPage(final int page, final String delimiters) throws PdfException {



        checkFileOpened();



        selectPage(page);



        final int x1 = currentPageData.getMediaBoxX(page);

        final int x2 = currentPageData.getMediaBoxWidth(page) + x1;



        final int y2 = currentPageData.getMediaBoxX(page);

        final int y1 = currentPageData.getMediaBoxHeight(page) - y2;



        return getWordsOnPage(page, x1, y1, x2, y2, delimiters);





    }



    /**

     * Gets the individual words from the pages text content with a greater degree of control.

     *

     * @param page The page to get text content from.

     * @param x1   The left most point to extract from.

     * @param y1   The top most point to extract from.

     * @param x2   The right most point to extract from.

     * @param y2   The bottom most point to extract from.

     * @param delimiters key to separate values

     * @return List object containing all words found on the page.

     * @throws PdfException if problem with parsing and extraxting text from PDF file

     */

    public List<String> getWordsOnPage(final int page, final int x1, final int y1, final int x2, final int y2, final String delimiters) throws PdfException {



        checkFileOpened();



        selectPage(page);



        return currentGrouping.extractTextAsWordlist(

                x1,

                y1,

                x2,

                y2,

                page,

                true, delimiters);



    }



    /**

     * Gets the individual words from the pages text content with a greater degree of control.

     *

     * @param page The page to get text content from.

     * @param rectangle   Rectangle area on the page to extract words from.

     * @param delimiters separator used for output

     * @return List object containing all words found on the page.

     * @throws PdfException if problem with parsing and extraxting text from PDF file

     */

    public List<String> getWordsOnPage(final int page, final Rectangle rectangle, final String delimiters) throws PdfException {



        checkFileOpened();



        selectPage(page);



        return currentGrouping.extractTextAsWordlist(

                rectangle.x,

                rectangle.y,

                rectangle.x + rectangle.width,

                rectangle.y + rectangle.height,

                page,

                true, delimiters);



    }



    /**

     * This class will allow you to extract any Words from page as a list via command line from a single PDF file or a directory of PDF files.

     * <br>

     * The example expects two:

     * <ul>

     * <li>Value 1 is the file name or directory of PDF files to process</li>

     * <li>Value 2 is directory to write out the outline data</li>

     * </ul>

     *

     * @param args The expected arguments are described above.

     */

    @SuppressWarnings("unused")

    public static void main(final String[] args) {

        final int len = args.length;

        switch (len) {

            case 0 -> {

                System.out.println("Example takes 2 parameters");

                System.out.println("Value 1 is the file name or directory of PDF files to process");

                System.out.println("Value 2 is Directory for writing the data as text files");

            }

            case 2 -> {

                try {

                    writeAllWordlistsToDir(args[0], args[1], -1);

                } catch (final PdfException e) {

                    LogWriter.error(e, "Exception thrown while extracting wordlist");

                }

            }

            default -> {

                System.out.println("too many arguments entered - run with no values to see defaults");



                final StringBuilder arguments = new StringBuilder();

                for (final String arg : args) {

                    arguments.append(arg).append('\n');

                }

                System.out.println("you entered:\n" + arguments + "as the arguments");

            }

        }

    }



    /**
     * Creates the PdfDecoderServer stored in decode_pdf and configures it for
     * text extraction. Called by both constructors of this class.
     */
    @Override
    void init() {
        //create the decoder used by the rest of this class
        //NOTE(review): meaning of the 'true' flag is not visible here - confirm against PdfDecoderServer docs
        decode_pdf = new PdfDecoderServer(true);
        //in case fonts are not embedded
        FontMappings.setFontReplacements();
        decode_pdf.setExtractionMode(PdfDecoderServer.TEXT); //extract just text
        PdfDecoderServer.init(true);
        //if you do not require XML content, pure text extraction
        decode_pdf.useTextExtraction();
    }



    /**

     * Convenience method to write all the Wordlists in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param password user or owner password for pdf file

     * @param outputDir directory for writing out wordlists

     * @param maxPages limit to the first pages up to this page

     * @return count of words extracted in total

     * @throws org.jpedal.exception.PdfException if problem with parsing and extraxting text from PDF file

     */

    public static int writeAllWordlistsToDir(final String inputDir, final String password, final String outputDir, final int maxPages) throws PdfException {

        return writeAllWordlistsToDir(inputDir, password, outputDir, maxPages, null);

    }



    /**

     * Convenience method to write all the Wordlists in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param password user or owner password for pdf file

     * @param outputDir directory for writing out wordlists

     * @param maxPages limit to the first pages up to this page

     * @param errorTracker a custom error tracker

     * @return count of words extracted in total

     * @throws org.jpedal.exception.PdfException if problem with parsing and extraxting text from PDF file

     */

    public static int writeAllWordlistsToDir(final String inputDir, final String password, final String outputDir,

            final int maxPages, final ErrorTracker errorTracker) throws PdfException {

        final ExtractTextAsWordlist extract = new ExtractTextAsWordlist(inputDir);



        if (password != null) {

            extract.setPassword(password);

        }



        if (errorTracker != null) {

            extract.decode_pdf.addExternalHandler(extract, Options.ErrorTracker);

        }



        extract.setup(outputDir, maxPages);



        extract.processFiles(inputDir);



        extract.closePDFfile();



        return extract.wordsExtracted;

    }



    /**

     * Convenience method to write all the Wordlists in a directory of PDF files

     *

     * @param inputDir  directory containing PDF files

     * @param outputDir directory for writing out wordlists

     * @param maxPages limit to just the first maxPages of a document

     * @return count of number of words extracted

     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file

     */

    public static int writeAllWordlistsToDir(final String inputDir, final String outputDir, final int maxPages) throws PdfException {

        return writeAllWordlistsToDir(inputDir, null, outputDir, maxPages, null);

    }



    private void setup(String outputDir, final int maxCount) {



        //check output dir has separator

        if (!outputDir.endsWith(separator)) {

            outputDir += separator;

        }



        this.outputDir = outputDir;

        this.maxCount = maxCount;

    }

}