// Java PDF example code - ExtractClippedImages.java


/*

 * Copyright (c) 1997-2025 IDRsolutions (https://www.idrsolutions.com)

 */

package org.jpedal.examples.images;



import com.idrsolutions.image.JDeli;

import com.idrsolutions.image.utility.SupportedFormats;

import org.jpedal.exception.PdfException;

import org.jpedal.external.ErrorTracker;

import org.jpedal.external.Options;

import org.jpedal.io.ColorSpaceConvertor;

import org.jpedal.utils.LogWriter;



import java.awt.Color;

import java.awt.Graphics2D;

import java.awt.Image;

import java.awt.image.BufferedImage;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.OutputStreamWriter;

import java.nio.charset.StandardCharsets;

import java.util.ArrayList;



/**

 * <h2>Clipped Image Extraction from PDF files</h2>

 * <br>

 * This class provides a simple Java API to extract clipped images from a PDF file and also

 * a static convenience method if you just want to dump all the images from a PDF file

 * or directory containing PDF files at a set of sizes<br>

 *

 * <br><a href="https://support.idrsolutions.com/jpedal/tutorials/extract-images/extract-clipped-images-from-pdf">See our support pages for more information on extracting images.</a>

 */

public class ExtractClippedImages extends BaseImageExtraction {



    /**
     * Target heights for resized copies of each image, parsed in {@code setup} from the
     * values that do not start with "x". A value that is not greater than zero leaves the
     * image at its original size, and images are never enlarged (see {@code generateVersion}).
     */

    private float[] outputSizes;



    /**
     * Output directory for each entry in {@link #outputSizes} (same index).
     */

    private String[] outputSizeDirectories;



    /**
     * Page scaling values applied before decoding, parsed in {@code setup} from the values
     * prefixed with "x". When any sizes are configured an extra entry of 1 is appended so the
     * size versions are generated from an unscaled decode.
     */

    private float[] outputScales;



    /**
     * Output directory for each entry in {@link #outputScales} (same index). A {@code null}
     * entry marks the extra unscaled pass that writes the {@link #outputSizes} versions.
     */

    private String[] outputScaleDirectories;



    /**
     * Background colour painted behind images saved as JPEG, which has no transparency.
     */

    private static final Color backgroundColor = Color.WHITE;



    /**

     * Sets up an ExtractClippedImages instance to open a PDF File

     *

     * @param fileName full path to a single PDF file

     */

    public ExtractClippedImages(final String fileName) {

        super(fileName);



        init();

    }



    /**

     * Sets up an ExtractClippedImages instance to open  a PDF file contained as a BLOB within a byte[] stream

     *

     * @param byteArray pdf file data

     */

    public ExtractClippedImages(final byte[] byteArray) {

        super(byteArray);



        init();



    }



    /**

     * extract any image from any page - recommended you process images on each page in turn as quicker

     *

     * @param page        logical page number (1 is first page)

     * @param imageNumber image on page (0 is first image)

     * @return BufferedImage

     * @throws PdfException PdfException

     */

    public BufferedImage getClippedImage(final int page, final int imageNumber) throws PdfException {



        return getClippedImage(page, getImageName(page, imageNumber));

    }



    /**

     * extract any image from any page - recommended you process images on each page in turn as quicker

     *

     * @param page       logical page number (1 is first page)

     * @param image_name name of image

     * @return BufferedImage

     * @throws PdfException PdfException

     */

    private BufferedImage getClippedImage(final int page, final String image_name) throws PdfException {



        selectPage(page);



        return decode_pdf.getObjectStore().loadStoredImage("CLIP_" + image_name);



    }



    /**

     * Convenience method to Extract all the images in a directory of PDF files

     *

     * @param inputDir directory of input files

     * @param password password to open PDF files

     * @param outDir directory of output files

     * @param imageType 3 letter value for image format to be used

     * @param subDirs sub directory of files

     * @param errorTracker a custom error tracker

     * @throws org.jpedal.exception.PdfException PdfException

     */

    public static void writeAllClippedImagesToDirs(final String inputDir, final String password, final String outDir,

            final String imageType, final String[] subDirs, final ErrorTracker errorTracker) throws PdfException {



        if (SupportedFormats.hasEncoderSupportForImageFormat(imageType)) {

            final ExtractClippedImages extract = new ExtractClippedImages(inputDir);



            if (password != null) {

                extract.setPassword(password);

            }



            if (errorTracker != null) {

                extract.decode_pdf.addExternalHandler(extract, Options.ErrorTracker);

            }



            extract.setup(outDir, imageType, subDirs);



            extract.processFiles(inputDir);



            extract.closePDFfile();

        } else {

            throw new RuntimeException("Unknown image format - " + imageType);

        }



    }



    /**

     * Convenience method to Extract all the images in a directory of PDF files

     *

     * @param inputDir directory of input files

     * @param password password to open PDF files

     * @param outDir directory of output files

     * @param imageType 3 letter value for image format to be used

     * @param subDirs sub directory of files

     * @throws org.jpedal.exception.PdfException PdfException

     */

    public static void writeAllClippedImagesToDirs(final String inputDir, final String password, final String outDir, final String imageType, final String[] subDirs) throws PdfException {

        writeAllClippedImagesToDirs(inputDir, password, outDir, imageType, subDirs, null);

    }



    /**

     * Convenience method to Extract all the images in a directory of PDF files

     *

     * @param inputDir directory of input files

     * @param outDir directory of output files

     * @param imageType 3 letter value for image format to be used

     * @param subDirs sub directory of files

     * @throws org.jpedal.exception.PdfException PdfException

     */

    public static void writeAllClippedImagesToDirs(final String inputDir, final String outDir, final String imageType, final String[] subDirs) throws PdfException {

        writeAllClippedImagesToDirs(inputDir, null, outDir, imageType, subDirs, null);



    }



    private void setup(String outDir, final String imageType, final String[] subDirs) {



        //check output dir has separator

        if (!outDir.endsWith(separator)) {

            outDir += separator;

        }



        this.imageType = imageType;



        //read output values

        final int outputCount = (subDirs.length) / 2;



        //read and create output directories

        final ArrayList<Float> sizes = new ArrayList<>();

        final ArrayList<String> sizeDirs = new ArrayList<>();

        final ArrayList<Float> scales = new ArrayList<>();

        final ArrayList<String> scaleDirs = new ArrayList<>();

        for (int i = 0; i < outputCount; i++) {



            String output = outDir + subDirs[1 + (i * 2)];



            if ((!output.endsWith("\\")) && (!output.endsWith("/"))) {

                output += separator;

            }



            final File dir = new File(output);

            if (!dir.exists()) {

                dir.mkdirs();

            }





            final String value = subDirs[(i * 2)];

            if (value.startsWith("x")) {

                scales.add(Float.parseFloat(value.substring(1)));

                scaleDirs.add(output);

            } else {

                sizes.add(Float.parseFloat(value));

                sizeDirs.add(output);

            }

        }





        outputSizes = new float[sizes.size()];

        outputSizeDirectories = new String[sizes.size()];

        for (int i = 0; i != sizes.size(); i++) {

            outputSizes[i] = sizes.get(i);

            outputSizeDirectories[i] = sizeDirs.get(i);

        }



        final int scalesCount;

        if (outputSizes.length > 0) {

            scalesCount = scales.size() + 1;

        } else {

            scalesCount = scales.size();

        }

        outputScales = new float[scalesCount];

        outputScaleDirectories = new String[scalesCount];

        for (int i = 0; i != scales.size(); i++) {

            outputScales[i] = scales.get(i);

            outputScaleDirectories[i] = scaleDirs.get(i);

        }



        if (outputSizes.length > 0) {

            outputScales[scales.size()] = 1;

            outputScaleDirectories[scales.size()] = null;

        }

    }



    /**

     * routine to decodeFile a PDF file

     */

    @Override

    public void decodeFile(final String file_name) throws PdfException {



        if (openPDFFile()) {



            //page range

            final int start = 1;

            final int end = getPageCount();



            try {

                 for (int page = start; page < end + 1; page++) { //read pages

                    for (int scaleIndex = 0; scaleIndex != outputScales.length; scaleIndex++) {

                        LogWriter.writeLog("Decoding Page " + page);



                        decode_pdf.getPdfPageData().setScalingValue(outputScales[scaleIndex]);



                        //image count (note image 1 is item 0, so any loop runs 0 to count-1)

                        final int image_count = getImageCount(page);



                        //tell user

                        if (image_count > 0) {

                            LogWriter.writeLog("page" + ' ' + page + "contains " + image_count + " images");

                        } else {

                            LogWriter.writeLog("No bitmapped images on page " + page);

                        }



                        LogWriter.writeLog("Writing out " + image_count + " images");



                        //location of images

                        final float[] x1 = new float[image_count];

                        final float[] y1 = new float[image_count];

                        final float[] w = new float[image_count];

                        final float[] h = new float[image_count];



                        final String[] image_name = new String[image_count];

                        final BufferedImage[] image = new BufferedImage[image_count];



                        //work through and get each image details

                        for (int i = 0; i < image_count; i++) {



                            image_name[i] = getImageName(page, i);



                            //we need some duplicates as we update some values on merge but still need originals at end

                            //so easiest just to store

                            x1[i] = pdf_images.getImageXCoord(i);

                            y1[i] = pdf_images.getImageYCoord(i);

                            w[i] = pdf_images.getImageWidth(i);

                            h[i] = pdf_images.getImageHeight(i);



                            image[i] = getClippedImage(page, image_name[i]);



                        }



                        //save each image

                        for (int i = 0; i < image_count; i++) {



                            if (image[i] != null) {

                                final String entry = "<PAGELOCATION x1=\"" + x1[i] + "\" "

                                        + "y1=\"" + (y1[i] + h[i]) + "\" "

                                        + "x2=\"" + (x1[i] + w[i]) + "\" "

                                        + "y2=\"" + (y1[i]) + "\" />\n";

                                if (outputScaleDirectories[scaleIndex] == null) {

                                    generateVersionsForSizes(file_name, page, entry, image[i], i, outputSizes.length);

                                } else {

                                    generateVersion(file_name, page, entry, image[i], i, outputScaleDirectories[scaleIndex], outputScales[scaleIndex], true);

                                }

                            }

                        }





                        //flush images in case we do more than 1 page so only contains

                        //images from current page

                        decode_pdf.flushObjectValues(true);

                    }

                }

            } catch (final Exception e) {

                decode_pdf.closePdfFile();

                LogWriter.error(e, "Exception thrown when extracting clipped images from " + file_name);

            }

        }



        decode_pdf.closePdfFile();



    }



    private void generateVersionsForSizes(final String file_name, final int page, final String s, final BufferedImage bufferedImage, final int i, final int outputCount) {



        for (int versions = 0; versions < outputCount; versions++) {

            generateVersion(file_name, page, s, bufferedImage, i, outputSizeDirectories[versions], outputSizes[versions], false);

        }

    }



    private void generateVersion(final String file_name, final int page, final String s, final BufferedImage bufferedImage, final int i, final String directory, final float size, final boolean scale) {



            try {

                //find out format image was saved in



                //load image (converted to rgb)

                BufferedImage image_to_save = bufferedImage;

                if (image_to_save == null) {

                    return;

                }



                int index = file_name.lastIndexOf('\\');

                if (index == -1) {

                    index = file_name.lastIndexOf('/');

                }

                if (index == -1) {

                    index = 0;

                }

                final String nameToUse = file_name.substring(index, file_name.length() - 4);

                final String outputName = directory + nameToUse + '_' + page + '_' + i;



                float scaling = 1;



                final int newHeight = image_to_save.getHeight();



                if (scale) {

                    scaling = size;

                } else {

                    if (size > 0) {

                        scaling = size / newHeight;

                    }

                    if (scaling > 1) {

                        scaling = 1;

                    }

                }



                if (!scale && scaling != 1) {



                    final Image scaledImage = image_to_save.getScaledInstance(-1, (int) size, BufferedImage.SCALE_SMOOTH);



                    image_to_save = new BufferedImage((int) (image_to_save.getWidth() * scaling), (int) size, BufferedImage.TYPE_INT_ARGB);



                    final Graphics2D g2 = image_to_save.createGraphics();



                    g2.drawImage(scaledImage, 0, 0, null);



                }



                //no transparency on JPEG so give background and draw on

                if (imageType.startsWith("jp") && !"jp2".equalsIgnoreCase(imageType)) {



                    final int iw = image_to_save.getWidth();

                    final int ih = image_to_save.getHeight();

                    final BufferedImage background = new BufferedImage(iw, ih, BufferedImage.TYPE_INT_RGB);



                    final Graphics2D g2 = (Graphics2D) background.getGraphics();

                    g2.setPaint(backgroundColor);

                    g2.fillRect(0, 0, iw, ih);



                    g2.drawImage(image_to_save, 0, 0, null);

                    image_to_save = background;



                }



                if (image_to_save.getType() == BufferedImage.TYPE_CUSTOM) {

                    image_to_save = ColorSpaceConvertor.convertToARGB(image_to_save);

                }



                try {

                    JDeli.write(image_to_save, imageType, new File(outputName + '.' + imageType));

                } catch (final IOException ex) {

                    LogWriter.writeLog("Exception in writing image " + ex);

                }



                //save an xml file with details

                try (OutputStreamWriter output_stream = new OutputStreamWriter(new FileOutputStream(outputName + ".xml"), StandardCharsets.UTF_8)) {



                    output_stream.write(

                            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");

                    output_stream.write(

                            "<!-- Pixel Location of image x1,y1,x2,y2\n");

                    output_stream.write("(x1,y1 is top left corner)\n");

                    output_stream.write(

                            "(origin is bottom left corner)  -->\n");

                    output_stream.write("\n\n<META>\n");

                    output_stream.write(s);

                    output_stream.write("<FILE>" + file_name + "</FILE>\n");

                    if (scale) {

                        output_stream.write("<ORIGINALHEIGHT>" + (int) (image_to_save.getHeight() / size) + "</ORIGINALHEIGHT>\n");

                    } else {

                        output_stream.write("<ORIGINALHEIGHT>" + newHeight + "</ORIGINALHEIGHT>\n");

                    }

                    output_stream.write("<SCALEDHEIGHT>" + image_to_save.getHeight() + "</SCALEDHEIGHT>\n");

                    output_stream.write("<SCALING>" + scaling + "</SCALING>\n");

                    output_stream.write("</META>\n");

                } catch (final IOException e) {

                    LogWriter.error(e, "Exception thrown writing out details for clipped images");

                }

            } catch (final Exception ee) {

                LogWriter.writeLog("Exception " + ee + " in extracting images");

            }

    }





    /**

     * main routine which checks for any files passed and runs the demo

     *

     * @param args arguments

     */

    @SuppressWarnings("unused")

    public static void main(final String[] args) {



        final String[] subDirs = validateInputValues(args);



        try {

            writeAllClippedImagesToDirs(args[0], args[1], args[2], subDirs);

        } catch (final PdfException e) {

            throw new RuntimeException(e);

        }



    }



    private static String[] validateInputValues(final String[] args) throws RuntimeException {



        final String[] subDirs;

        final String inputDir;  //rootDir containing files



        //exit and report if wrong number of values

        if (args.length >= 5 && (args.length % 2) == 1) {



            LogWriter.writeLog("Values read");

            LogWriter.writeLog("inputDir=" + args[0]);

            LogWriter.writeLog("type=" + args[1]);

            LogWriter.writeLog("Directory and height pair values" + args[3] + " <> " + args[4] + '<');



            inputDir = args[0];



            final int outputCount = (args.length - 3);



            subDirs = new String[outputCount];



            for (int i = 0; i < outputCount; i++) {

                LogWriter.writeLog(args[i + 3]);

                if (((i % 2) == 0) && (!args[i + 3].matches("((-|\\+|x)?[0-9]+(\\.[0-9]+)?)+"))) {

                    throw new RuntimeException("Invalid value: " + args[i + 3]);

                }

                subDirs[i] = args[i + 3];

            }



        } else if (((args.length - 3) % 2) == 1) {

            throw new RuntimeException("Value/Directory pairs invalid");

        } else {

            System.out.println("Requires");

            System.out.println("inputDir processedDir imageOutputType");

            System.out.println("height Directory (as many pairs as you like)");

            throw new RuntimeException("Not enough parameters passed to software");

        }



        final File pdf_file = new File(inputDir);



        if (!pdf_file.exists()) {

            throw new RuntimeException("Directory " + inputDir + " not found");

        }



        return subDirs;

    }



    @Override

    void init() {



        type = ExtractTypes.CLIPPED_IMAGES;



        super.init();

    }



    /**

     * returns an image count for the selected page

     *

     * @param page logical page number

     * @return int number of images (0 if no images)

     * @throws PdfException PdfException

     */

    public int getImageCount(final int page) throws PdfException {



        decode_pdf.flushObjectValues(true);



        try {

            //read pages



            //decode the page

            decode_pdf.decodePage(page);

        } catch (final Exception ex) { // Cascade up

            throw new PdfException(ex.getMessage(), ex);

        }



        // Get the PdfImages object which now holds the images.

        // Binary data is stored in a temp directory and we hold the

        // image name and other info in this object

        pdf_images = decode_pdf.getPdfImageData();



        //image count (note image 1 is item 0, so any loop runs 0 to count-1)

        return pdf_images.getImageCount();

    }





    /**

     * Return name of image (composite of filename and Internal PDF image name)

     *

     * @param page        - logical page number

     * @param imageNumber - number of image (0 is first image)

     * @return - String containing image name

     * @throws PdfException PdfException

     */

    private String getImageName(final int page, final int imageNumber) throws PdfException {



        selectPage(page);



        return pdf_images.getImageName(imageNumber);

    }

}