// Java PDF example code - ConvertPagesToHiResImages.java


/*

 * Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)

 */

package org.jpedal.examples.images;



import com.idrsolutions.image.utility.SupportedFormats;
import org.jpedal.PdfDecoderServer;
import org.jpedal.color.ColorSpaces;
import org.jpedal.constants.JPedalSettings;
import org.jpedal.constants.PageInfo;
import org.jpedal.exception.PdfException;
import org.jpedal.fonts.FontMappings;
import org.jpedal.io.ColorSpaceConvertor;
import org.jpedal.utils.LogWriter;

import java.awt.Graphics;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;



/**

 * <h2>Image Extraction from PDF files</h2>

 * <br>

 * This class provides a simple Java API to convert pages in a PDF files into images and also

 * a static convenience method if you just want to dump all the pages as images from a PDF file

 * or directory containing PDF files<br>

 * <br>

 * See our Support Page for  <a href="https://support.idrsolutions.com/jpedal/tutorials/convert-images/convert-pdf-to-bufferedimage"> Examples on Convert PDF pages to Images</a> <br>

 * There is a simpler example <a href="https://javadoc.idrsolutions.com/org/jpedal/examples/images/ConvertPagesToImages.html"> (org.jpedal.examples.images.ConvertPagesToImages)</a> for producing images of pages if extra features not needed<br>

 * <br>

 */

public final class ConvertPagesToHiResImages extends BaseImageExtraction {



    @SuppressWarnings("unused")

    public static void main(final String[] args) {



        if (args != null && args.length > 1) {



            try {

                if (args.length == 2) {

                    writeAllPagesAsHiResImagesToDir(args[0], args[1], "png");

                } else {

                    writeAllPagesAsHiResImagesToDir(args[0], args[1], args[2]);

                }

            } catch (final PdfException ex) {

                throw new RuntimeException(ex.getMessage());

            }

        } else if (args == null) {

            System.out.println("null arguments entered");



        } else {

            System.out.println("wrong arguments entered");



            final StringBuilder arguments = new StringBuilder();

            for (final String arg : args) {

                arguments.append(arg).append('\n');

            }

            System.out.println("you entered:\n" + arguments + "as the arguments");

        }

    }



    /**

     * static method to write out all pages in a PDF files or directory of PDF files as images

     * Not for use with other image conversion methods in multi-threaded environments.

     * This method utilises some variables that may impact image conversion taking place on other threads.

     *

     * @param inputDir directory of files to convert

     * @param outputDir directory of output

     * @param format format of images

     * @throws org.jpedal.exception.PdfException PdfException

     */

    public static void writeAllPagesAsHiResImagesToDir(final String inputDir, final String outputDir, final String format) throws PdfException {



        /*

         * this process is very flaxible to we create a Map and pass in values to select what sort

         * of results we want. There is a choice between methods used and image size. Larger images use more

         * memory and are slower but look better

         */

        final Map<Integer, Object> mapValues = new HashMap<>();



        /* USEFUL OPTIONS*/

        //do not scale above this figure

        mapValues.put(JPedalSettings.EXTRACT_AT_BEST_QUALITY_MAXSCALING, 2);



        //alternatively secify a page size (aspect ratio preserved so will do best fit)

        //set a page size (JPedal will put best fit to this)

        mapValues.put(JPedalSettings.EXTRACT_AT_PAGE_SIZE, new String[]{"2000", "1600"});



        //which takes priority (default is false)

        mapValues.put(JPedalSettings.PAGE_SIZE_OVERRIDES_IMAGE, Boolean.TRUE);



        writeAllPagesAsHiResImagesToDir(inputDir, outputDir, format, mapValues);



    }



    /**

     * static method to write out all pages in a PDF files or directory of PDF files as images

     * Not for use with other image conversion methods in multi-threaded environments.

     * This method utilises some variables that may impact image conversion taking place on other threads.

     *

     * @param inputDir directory of files to convert

     * @param outputDir directory of output

     * @param format format of images

     * @param mapValues Map of KeyPair values from <a href="https://files.idrsolutions.com/maven/site/jpedal/apidocs/org/jpedal/constants/JPedalSettings.html">JPedalSettings</a> <br>

     * @throws org.jpedal.exception.PdfException PdfException

     */

    public static void writeAllPagesAsHiResImagesToDir(final String inputDir, final String outputDir, final String format, final Map<Integer, Object> mapValues) throws PdfException {



        if (SupportedFormats.hasEncoderSupportForImageFormat(format)) {



            PdfDecoderServer.modifyJPedalParameters(mapValues);



            final ConvertPagesToHiResImages convert = new ConvertPagesToHiResImages(inputDir);



            convert.setup(format, outputDir);



            convert.processFiles(inputDir);



            convert.closePDFfile();

        } else {

            throw new RuntimeException("Unknown image format - " + format);

        }

    }



    /**

     * Sets up an ConvertPagesToHiResImages instance to open a PDF File

     *

     * @param fileName full path to a single PDF file

     */

    public ConvertPagesToHiResImages(final String fileName) {

        super(fileName);



        init();

    }



    /**

     * Sets up an ConvertPagesToHiResImages instance to open  a PDF file contained as a BLOB within a byte[] stream

     *

     * @param byteArray pdf file data

     */

    public ConvertPagesToHiResImages(final byte[] byteArray) {

        super(byteArray);



        init();



    }



    /**

     * main constructor to convert PDF to img

     *

     * @param pdfFile reference to pdf file data

     * @throws PdfException if problem with parsing PDF file

     */

    @Override

    void decodeFile(final String pdfFile) throws PdfException {



        if (openPDFFile()) {



            
            /*

             * allow output to multiple images with different values on each

             *

             * Note we REMOVE shapes as it is a new feature and we do not want to break existing functions

             */

            final String separation = System.getProperty("org.jpedal.separation");

            if (separation != null) {



                Object[] sepValues = {7, "", Boolean.FALSE}; //default of normal

                if ("all".equals(separation)) {

                    sepValues = new Object[]{PdfDecoderServer.RENDERIMAGES, "image_and_shapes", Boolean.FALSE,

                            PdfDecoderServer.RENDERIMAGES + PdfDecoderServer.REMOVE_RENDERSHAPES, "image_without_shapes", Boolean.FALSE,

                            PdfDecoderServer.RENDERTEXT, "text_and_shapes", Boolean.TRUE,

                            7, "all", Boolean.FALSE,

                            PdfDecoderServer.RENDERTEXT + PdfDecoderServer.REMOVE_RENDERSHAPES, "text_without_shapes", Boolean.TRUE

                    };

                }



                final int sepCount = sepValues.length;

                for (int seps = 0; seps < sepCount; seps += 3) {



                    decode_pdf.setRenderMode((Integer) sepValues[seps]);



                    extractPagesAsImages(output_dir, imageType, "_" + sepValues[seps + 1], (Boolean) sepValues[seps + 2]); //boolean makes last transparent so we can see white text



                }



            } else { //just get the page

                    extractPagesAsImages(output_dir, imageType, "", false);

            }

        }

    }



    /**

     * actual conversion of a PDF page into an image

     *

     * @param fileType image output format

     * @param outputPath location for output of image

     * @param prefix file name prefix

     * @param isTransparent flag to show if image is transparent

     * @throws PdfException if problem with extracting images from PDF file

     */

    private void extractPagesAsImages(final String outputPath, final String fileType, final String prefix, final boolean isTransparent) throws PdfException {



        //create a directory if it doesn't exist

        final File output_path = new File(output_dir);

        if (!output_path.exists()) {

            output_path.mkdirs();

        }



        //page range

        final int start = 1;

        final int end = getPageCount();



        /*

         * set of JVM flags which allow user control on process

         */





        //////////////////TIFF OPTIONS/////////////////////////////////////////



        final String multiPageFlag = System.getProperty("org.jpedal.multipage_tiff");

        final boolean isSingleOutputFile = "true".equalsIgnoreCase(multiPageFlag);



        final String tiffFlag = System.getProperty("org.jpedal.compress_tiff");

        final boolean compressTiffs = "true".equalsIgnoreCase(tiffFlag);



        setJPEGCompression();



        ///////////////////////////////////////////////////////////////////////



        for (int pageNo = start; pageNo < end + 1; pageNo++) {



            /*

             * If you are using decoder.getPageAsHiRes() after passing additional parameters into JPedal using the static method

             * PdfDecoder.modifyJPedalParameters(), then getPageAsHiRes() wont necessarily be thread safe.  If you want to use

             * getPageAsHiRes() and pass in additional parameters, in a thread safe mannor, please use the method

             * getPageAsHiRes(int pageIndex, Map params) or getPageAsHiRes(int pageIndex, Map params, boolean isTransparent) and

             * pass the additional parameters in directly to the getPageAsHiRes() method without calling PdfDecoder.modifyJPedalParameters()

             * first.

             *

             * Please see org/jpedal/examples/images/ConvertPagesToImages.java.html for more details on how to use HiRes image conversion

             */

            BufferedImage imageToSave = getPageAsHiResImage(pageNo, isTransparent, null);



            decode_pdf.flushObjectValues(true);



            //image needs to be sRGB for JPEG

            if ("jpg".equals(fileType)) {

                imageToSave = ColorSpaceConvertor.convertToRGB(imageToSave);

            }



            final String outputFileName;

            if (isSingleOutputFile) {

                outputFileName = outputPath + "allPages" + prefix + '.' + fileType;

            } else {

                /*

                 * create a name with zeros for if more than 9 pages appears in correct order

                 */

                outputFileName = outputPath + "page" + getPageName(end, pageNo) + prefix + '.' + fileType;

            }



            //if just gray we can reduce memory usage by converting image to Grayscale



            /*

             * see what Colorspaces used and reduce image if appropriate

             * (only does Gray at present)

             *

             * Can return null value if not sure

             */

            final Iterator<Integer> colorspacesUsed = decode_pdf.getPageInfo(PageInfo.COLORSPACES);



            int nextID;

            boolean isGrayOnly = colorspacesUsed != null; //assume true and disprove



            while (colorspacesUsed != null && colorspacesUsed.hasNext()) {

                nextID = colorspacesUsed.next();



                if (nextID != ColorSpaces.DeviceGray && nextID != ColorSpaces.CalGray) {

                    isGrayOnly = false;

                }

            }



            //draw onto GRAY image to reduce colour depth

            //(converts ARGB to gray)

            if (isGrayOnly) {

                final BufferedImage image_to_save2 = new BufferedImage(imageToSave.getWidth(), imageToSave.getHeight(), BufferedImage.TYPE_BYTE_GRAY);

                image_to_save2.getGraphics().drawImage(imageToSave, 0, 0, null);

                imageToSave = image_to_save2;

            }



            //we save the image out here

            if (imageToSave != null) {

                try {

                    saveImage(fileType, isSingleOutputFile, compressTiffs, pageNo, imageToSave, outputFileName);

                } catch (final Exception e) {

                    LogWriter.writeLog("Unable to write out image " + e.getMessage());

                }



                imageToSave.flush();

            }

        }

    }



    private static StringBuilder getPageName(final int end, final int pageNo) {

        final StringBuilder pageAsString = new StringBuilder(String.valueOf(pageNo));

        final String maxPageSize = String.valueOf(end);

        final int padding = maxPageSize.length() - pageAsString.length();

        for (int ii = 0; ii < padding; ii++) {

            pageAsString.insert(0, '0');

        }

        return pageAsString;

    }



    public BufferedImage getPageAsHiResImage(final int page, final boolean isTransparent, final Map<Integer, Object> options) throws PdfException {



        return decode_pdf.getPageAsHiRes(page, options, isTransparent);



    }



    private void setup(final String format, String outputDir) {



        //check output dir has separator

        if (!outputDir.endsWith(separator)) {

            outputDir += separator;

        }



        imageType = format;

        output_dir = outputDir;

    }



    @Override

    void init() {



        //mappings for non-embedded fonts to use

        FontMappings.setFontReplacements();



        type = ExtractTypes.RASTERIZED_PAGE;



        super.init();

    }

}