// Java PDF example code - ExtractImages.java


/*

 * Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)

 */

package org.jpedal.examples.images;



import com.idrsolutions.image.JDeli;

import com.idrsolutions.image.tiff.TiffEncoder;

import com.idrsolutions.image.tiff.options.TiffCompressionFormat;

import com.idrsolutions.image.utility.SupportedFormats;

import org.jpedal.exception.PdfException;

import org.jpedal.external.ErrorTracker;

import org.jpedal.external.Options;

import org.jpedal.objects.PdfImageData;

import org.jpedal.utils.LogWriter;

import org.jpedal.utils.SecureDocumentBuilderFactory;

import org.jpedal.utils.SecureTransformerFactory;

import org.w3c.dom.Document;

import org.w3c.dom.Element;

import org.w3c.dom.Node;



import javax.xml.parsers.DocumentBuilder;

import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;

import javax.xml.transform.stream.StreamSource;

import java.awt.image.BufferedImage;

import java.io.File;

import java.io.FileOutputStream;

import java.io.InputStream;



/**

 * <h2>Image Extraction from PDF files</h2>

 * <br>

 * This class provides a simple Java API to extract images from a PDF file and also

 * a static convenience method if you just want to dump all the images from a PDF file

 * or directory containing PDF files.<br>

 * <br>

 *<a href="https://support.idrsolutions.com/jpedal/tutorials/extract-images/extract-images-from-pdf">See our Support Pages for more info on Image Extraction.</a>

 */

public class ExtractImages extends BaseImageExtraction {



    // When true, the images of each page are written into a sub-directory named after the page number.
    private boolean outputPagesInSeparateDirs = true;



    // Output directory configured through setup(); while null, decodeFile falls back to a default location.
    private String defaultOutputDir;



    // When true, an XML metadata file is written for every extracted image.
    private boolean writeOutMetadata = true;



    /**
     * Sets up an ExtractImages instance to open a PDF file stored on disk.
     * <p>
     * The path is handed to the superclass constructor, after which {@link #init()}
     * configures this instance for image extraction.
     *
     * @param fileName full path to a single PDF file (the static writeAllImagesToDir
     *                 helpers of this class also pass their input location here)
     */

    public ExtractImages(final String fileName) {

        super(fileName);



        // Mark this extractor as an image extractor and run the shared initialisation
        init();

    }



    /**
     * Sets up an ExtractImages instance to open a PDF file held in memory as a byte[]
     * (for example a BLOB read from a database).
     * <p>
     * The data is handed to the superclass constructor, after which {@link #init()}
     * configures this instance for image extraction.
     *
     * @param byteArray pdf file data
     */

    public ExtractImages(final byte[] byteArray) {

        super(byteArray);



        // Mark this extractor as an image extractor and run the shared initialisation
        init();

    }



    /**
     * Convenience method to extract all the images from password-protected PDF files
     * without supplying a custom error tracker.
     * <p>
     * Delegates to the seven-argument overload, passing {@code null} as the error tracker.
     *
     * @param inputDir             directory containing PDF files
     * @param password             password used to open PDF files
     * @param outputDir            directory for writing out images
     * @param imageType            name of the image format to be used (main lists jpeg, tiff and png);
     *                             the delegate throws a RuntimeException for a format without encoder support
     * @param generateMetaData     if true include additional XML file with metadata on image
     * @param outputPagesInSepDirs if true place images from each page in separate sub-directory
     * @throws org.jpedal.exception.PdfException if problem with processing PDF files
     */

    public static void writeAllImagesToDir(final String inputDir, final String password, final String outputDir, final String imageType, final boolean generateMetaData, final boolean outputPagesInSepDirs) throws PdfException {

        writeAllImagesToDir(inputDir, password, outputDir, imageType, generateMetaData, outputPagesInSepDirs, null);

    }



    /**
     * Convenience method to extract all the images from PDF files that need neither
     * a password nor a custom error tracker.
     * <p>
     * Delegates to the seven-argument overload, passing {@code null} for both the
     * password and the error tracker.
     *
     * @param inputDir             directory containing PDF files
     * @param outputDir            directory for writing out images
     * @param imageType            name of the image format to be used (main lists jpeg, tiff and png);
     *                             the delegate throws a RuntimeException for a format without encoder support
     * @param generateMetaData     if true include additional XML file with metadata on image
     * @param outputPagesInSepDirs if true place images from each page in separate sub-directory
     * @throws org.jpedal.exception.PdfException if problem with processing PDF files
     */

    public static void writeAllImagesToDir(final String inputDir, final String outputDir, final String imageType, final boolean generateMetaData, final boolean outputPagesInSepDirs) throws PdfException {

        // null password and null error tracker
        writeAllImagesToDir(inputDir, null, outputDir, imageType, generateMetaData, outputPagesInSepDirs, null);

    }



    /**

     * Convenience method to Extract all the images in a directory of PDF files

     *

     * @param inputDir             directory containing PDF files

     * @param password             password used to open PDF files

     * @param outputDir            directory for writing out images

     * @param imageType 3 letter value for image format to be used

     * @param generateMetaData     if true include additional XML file with metadata on image

     * @param outputPagesInSepDirs if true place images from each page in separate sub-directory

     * @param errorTracker         a custom error tracker

     * @throws org.jpedal.exception.PdfException if problem with processing PDF files

     */

    public static void writeAllImagesToDir(final String inputDir, final String password, final String outputDir,

            final String imageType, final boolean generateMetaData, final boolean outputPagesInSepDirs, final ErrorTracker errorTracker) throws PdfException {



        if (SupportedFormats.hasEncoderSupportForImageFormat(imageType)) {

            final ExtractImages extract = new ExtractImages(inputDir);



            if (password != null) {

                extract.setPassword(password);

            }



            if (errorTracker != null) {

                extract.decode_pdf.addExternalHandler(extract, Options.ErrorTracker);

            }



            extract.setup(outputDir, imageType, generateMetaData, outputPagesInSepDirs);



            extract.processFiles(inputDir);



            extract.closePDFfile();

        } else {

            throw new RuntimeException("Unknown image format - " + imageType);

        }

    }



    @Override

    void decodeFile(final String fileName) throws PdfException {



        if (openPDFFile()) {



            //page range

            final int start = 1;

            final int end = getPageCount();



            /*

             * create output dir for images

             */

            if (defaultOutputDir == null) {

                output_dir = user_dir + "images" + separator + name + separator;

            } else {

                output_dir = defaultOutputDir;

            }



            //create a directory if it doesn't exist

            final File output_path = new File(output_dir);

            if (!output_path.exists()) {

                output_path.mkdirs();

            }





            for (int page = start; page < end + 1; page++) {



                //image count (note image 1 is item 0, so any loop runs 0 to count-1)

                final int image_count = getImageCount(page);



//tell user

                if (image_count > 0) {



                    //create a directory for page our put all in same dir

                    String target = output_dir;

                    if (outputPagesInSeparateDirs) {

                        target = output_dir + separator + page;

                    }



                    final File page_path = new File(target);

                    if (!page_path.exists()) {

                        page_path.mkdirs();

                    }

                }



                try {

                    writeImagesFromPage(image_count, page);

                } catch (final Exception ex) { // Cascade up

                    throw new PdfException(ex.getMessage(), ex);

                }



                // Flush images in case we do more than 1 page so only contains

                // images from current page

                decode_pdf.flushObjectValues(true);

            }

        }

    }



    private void writeImagesFromPage(final int image_count, final int page) throws Exception {



        BufferedImage image_to_save;



        String outputDir = output_dir;

        if (outputPagesInSeparateDirs) {

            outputDir = output_dir + page + separator;

        }



        // Work through and save each image

        for (int i = 0; i < image_count; i++) {





            final String image_name = getImageName(page, i);



            //get raw version of image (R imageType for raw image)

            image_to_save = getImage(page, image_name, false);



            saveImage(image_to_save, outputDir + 'R' + image_name + '_' + page + '.' + imageType, imageType);



            //load processed version of image (converted to rgb)

            image_to_save = getImage(page, image_name, true);



            //save image

            if (image_to_save != null) {

                saveImage(image_to_save, outputDir + image_name + '_' + page + '.' + imageType, imageType);

            }

            //save metadata as XML file

            if (writeOutMetadata) {

                outputMetaDataToXML(fileName, page, pdf_images, i, image_name);

            }

        }

    }



    /**
     * Marks this extractor as an image extractor and then runs the superclass initialisation.
     * Called from both constructors of this class.
     */
    @Override

    void init() {

        // Set before super.init() is invoked
        type = ExtractTypes.IMAGES;



        super.init();

    }



    /**

     * save image - different versions have different bugs for file formats so we use best for

     * each image type

     *

     * @param image_to_save extracted image

     */

    private static void saveImage(final BufferedImage image_to_save, final String fileName, final String prefix) throws Exception {



        if (prefix.contains("tif")) {



            final FileOutputStream os = new FileOutputStream(fileName);



            //get tiff compression

            final String tiffFlag = System.getProperty("org.jpedal.compress_tiff");

            final boolean compressTiffs = tiffFlag != null;



            final TiffEncoder tiffEncoder = new TiffEncoder();

            if (compressTiffs) {

                tiffEncoder.getEncoderOptions().setCompressionFormat(TiffCompressionFormat.DEFLATE);

            } else {

                tiffEncoder.getEncoderOptions().setCompressionFormat(TiffCompressionFormat.NONE);

            }

            tiffEncoder.write(image_to_save, os);



            os.flush();

            os.close();



        } else { //other images



            JDeli.write(image_to_save, prefix, new File(fileName));



        }

    }



    /**

     * write out details of image to XML file

     */

    private void outputMetaDataToXML(final String file_name, final int page, final PdfImageData pdf_images, final int i, final String image_name) {



        final float x1 = pdf_images.getImageXCoord(i);

        final float y1 = pdf_images.getImageYCoord(i);

        final float w = pdf_images.getImageWidth(i);

        final float h = pdf_images.getImageHeight(i);



        try {

            //create doc and set root

            final SecureDocumentBuilderFactory dbf = new SecureDocumentBuilderFactory();

            final DocumentBuilder db = dbf.newDocumentBuilder();

            final Document doc = db.newDocument();



            final Node root = doc.createElement("meta");

            doc.appendChild(root);



            //add comments

            final Node creation = doc.createComment("Created " + org.jpedal.utils.TimeNow.getShortTimeNow());

            doc.appendChild(creation);

            final Node info = doc.createComment("Pixel Location of image x1,y1,x2,y2");

            doc.appendChild(info);

            final Node moreInfo = doc.createComment("x1,y1 is top left corner origin is bottom left corner");

            doc.appendChild(moreInfo);



            //add location

            final Element location = doc.createElement("PAGELOCATION");

            location.setAttribute("x1", String.valueOf(x1));

            location.setAttribute("y1", String.valueOf((y1 + h)));

            location.setAttribute("x2", String.valueOf((x1 + w)));

            location.setAttribute("y2", String.valueOf(y1));

            root.appendChild(location);



            //add pdf file extracted from

            final Element fileName = doc.createElement("FILE");

            fileName.setAttribute("value", file_name);

            root.appendChild(fileName);



            //write out

            final Transformer transformer;

            try (InputStream stylesheet = getClass().getResourceAsStream("/org/jpedal/examples/text/xmlstyle.xslt")) {



                final TransformerFactory transformerFactory = SecureTransformerFactory.newInstance();

                transformer = transformerFactory.newTransformer(new StreamSource(stylesheet));

            }

            String outputDir = output_dir;

            if (outputPagesInSeparateDirs) {

                outputDir = output_dir + page + separator;

            }

            transformer.transform(new DOMSource(doc), new StreamResult(outputDir + image_name + ".xml"));



        } catch (final Exception e) {

            LogWriter.writeLog(e);

        }

    }



    //////////////////////////////////////////////////////////////////////////



    /**

     * This class will allow you to extract Images via command line from a single PDF file or a directory of PDF files.

     * <br>

     * The example expects three parameters:

     * <ul>

     * <li>Value 1 is the file name or directory of PDF files to process</li>

     * <li>Value 2 is directory to write out the images</li>

     * <li>Value 3 is image type (jpeg,tiff,png). Default is png</li>

     * </ul>

     *

     * @param args The expected arguments are described above.

     */

    @SuppressWarnings("unused")

    public static void main(final String[] args) {

        //check user has passed us a filename, output location and image type

        final int len = args.length;



        if (len != 3) {

            System.out.println("Class takes 3 parameters: ");

            System.out.println("Value 1 is the file name or directory of PDF files to process");

            System.out.println("Value 2 is Directory for writing the images");

            System.out.println("Value 3 is image type (jpeg,tiff,png).");



            if (len > 3) {

                System.out.println("\nToo many arguments entered");



                final StringBuilder arguments = new StringBuilder();

                for (final String arg : args) {

                    arguments.append(arg).append('\n');

                }

                System.out.println("You entered:\n" + arguments);

            }

        } else {

            try {

                writeAllImagesToDir(args[0], args[1], args[2], true, false);

            } catch (final PdfException e) {

                LogWriter.writeLog(e);

            }

        }

    }



    /**

     * extract any image from any page - recommended you process images on each page in turn as quicker

     *

     * @param page             logical page number (1 is first page)

     * @param imageNumber      image on page (0 is first image)

     * @param imageAsDisplayed if true return image as displayed (with scaling/rotation) otherwise use raw stored image (often but not always the same). Neither is clipped

     * @return BufferedImage

     * @throws PdfException if problem with extracting image from PDF file

     */

    public BufferedImage getImage(final int page, final int imageNumber, final boolean imageAsDisplayed) throws PdfException {



        checkFileOpened();



        return getImage(page, getImageName(page, imageNumber), imageAsDisplayed);

    }



    /**

     * extract any image from any page - recommended you process images on each page in turn as quicker

     *

     * @param page             logical page number (1 is first page)

     * @param image_name       name of image

     * @param imageAsDisplayed if true return image as displayed (with scaling/rotation) otherwise use raw stored image (often but not always the same). Neither is clipped

     * @return BufferedImage

     * @throws PdfException if problem with extracting image from PDF file

     */

    private BufferedImage getImage(final int page, final String image_name, final boolean imageAsDisplayed) throws PdfException {



        selectPage(page);



        if (imageAsDisplayed) {

            return decode_pdf.getObjectStore().loadStoredImage(image_name);

        } else {

            return decode_pdf.getObjectStore().loadStoredImage('R' + image_name);

        }

    }



    private void setup(String outputDir, final String imageType, final boolean generateMetaData, final boolean outputPagesInSepDirs) {



        this.imageType = imageType;



        if (outputDir != null) {



            //check output dir has separator

            if (!outputDir.endsWith(separator)) {

                outputDir += separator;

            }

            defaultOutputDir = outputDir;

        }



        writeOutMetadata = generateMetaData;

        outputPagesInSeparateDirs = outputPagesInSepDirs;



    }



    /**

     * returns an image count for the selected page

     *

     * @param page logical page number

     * @return int number of images (0 if no images)

     * @throws PdfException if problem with opening PDF file

     */

    public int getImageCount(final int page) throws PdfException {



        checkFileOpened();



        selectPage(page);



        //image count (note image 1 is item 0, so any loop runs 0 to count-1)

        return pdf_images.getImageCount();

    }



    public PdfImageData getImageData(final int page) throws PdfException {

        checkFileOpened();



        selectPage(page);



        return pdf_images;

    }





    /**

     * Return name of image (composite of filename and Internal PDF image name)

     *

     * @param page        - logical page number

     * @param imageNumber - number of image (0 is first image)

     * @return - String containing image name

     * @throws PdfException if problem with extracting image from PDF file

     */

    private String getImageName(final int page, final int imageNumber) throws PdfException {



        checkFileOpened();



        selectPage(page);



        return pdf_images.getImageName(imageNumber);

    }

}