/*
* Copyright (c) 1997-2025 IDRsolutions (https://www.idrsolutions.com)
*/
package org.jpedal.examples.images;
import com.idrsolutions.image.tiff.TiffEncoder;
import com.idrsolutions.image.tiff.options.TiffCompressionFormat;
import com.idrsolutions.image.utility.SupportedFormats;
import org.jpedal.PdfDecoderServer;
import org.jpedal.color.ColorSpaces;
import org.jpedal.constants.PageInfo;
import org.jpedal.exception.PdfException;
import org.jpedal.external.ErrorTracker;
import org.jpedal.external.Options;
import org.jpedal.fonts.FontMappings;
import org.jpedal.objects.PdfFileInformation;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.StringUtils;
import javax.print.attribute.standard.PageRanges;
import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.stream.IntStream;
/**
* <h2>Image Extraction from PDF files</h2>
* <br>
* This class provides a simple Java API to extract pages as images from a PDF file and also
* a static convenience method if you just want to dump all the pages as images from a PDF file
* or directory containing PDF files
* isBackgroundTransparent <b>MUST</b> be false for generating JPEG images
* <br>
* <br>
* Note: ConvertPagesToHiResImages is the recommended first choice for generating better quality images when converting with non-default settings.
* <br>
* See our Support Page for <a href="https://support.idrsolutions.com/jpedal/tutorials/convert-images/convert-pdf-to-bufferedimage"> Examples on Convert PDF pages to Images</a> <br>
* There is a simpler example <a href="https://javadoc.idrsolutions.com/org/jpedal/examples/images/ConvertPagesToHiResImages.html">(org.jpedal.examples.images.ConvertPagesToHiResImages)</a> for producing higher res images of pages (but likely to be slower).
*/
public class ConvertPagesToImages extends BaseImageExtraction {
/**
* Scaling applied when a page is rasterized.
* Uses 96 dpi (1.33) as default so pages are the correct size (72 dpi would be smaller).
*/
private float pageScaling = 1.33f;
/**
* Creator values of tools that produce OCR PDFs; when the Creator field of a file
* equals one of these, only the images are rendered for its pages.
*/
private final String[] ocr = {"TeleForm"};
/**
* Test hook used to limit the last page converted when no page range is set
* (only consulted when greater than 0 and the file has more than 10 pages) - please do not use
*/
public static int maxPageCount = -1;
// requested output size as {width, height} in pixels (see setFitToSize); null when not set
private int[] dimensions;
// page range members taken from PageRanges.getMembers() (see setPageRange); null when not set
private int[][] range;
// page numbers still to be converted; rebuilt by openPDFFile(), null when no range is set
private Iterator<Integer> pages;
/**
* Convenience static method to convert a PDF file or a directory of files.
* Delegates to the overload taking a password and an ErrorTracker, passing null for both.
*
* @param inputDir directory of files to convert
* @param outDir directory of output
* @param format format of images
* @param pageScaling scaling
* @throws org.jpedal.exception.PdfException PdfException
*/
public static void writeAllPagesAsImagesToDir(final String inputDir, final String outDir, final String format, final float pageScaling) throws PdfException {
writeAllPagesAsImagesToDir(inputDir, outDir, format, pageScaling, null, null);
}
/**
* Convenience static method to convert a PDF file with a password, or a directory of files.
* Delegates to the overload taking an ErrorTracker, passing null for the tracker.
*
* @param inputDir directory of files to convert
* @param outDir directory of output
* @param format format of images
* @param pageScaling scaling
* @param password to access PDF files
* @throws org.jpedal.exception.PdfException PdfException
*/
public static void writeAllPagesAsImagesToDir(final String inputDir, final String outDir, final String format, final float pageScaling, final String password) throws PdfException {
writeAllPagesAsImagesToDir(inputDir, outDir, format, pageScaling, password, null);
}
/**
 * Convenience static method to convert a PDF file with a password, or a directory of files.
 * <p>
 * The converter is always closed before this method returns, including when the
 * conversion fails part way through.
 *
 * @param inputDir directory of files to convert
 * @param outDir directory of output
 * @param format format of images
 * @param pageScaling scaling
 * @param password to access PDF files (ignored when null)
 * @param errorTracker custom error tracker (ignored when null)
 * @throws org.jpedal.exception.PdfException PdfException
 * @throws RuntimeException if there is no encoder for the requested image format
 */
public static void writeAllPagesAsImagesToDir(final String inputDir, final String outDir, final String format,
        final float pageScaling, final String password, final ErrorTracker errorTracker) throws PdfException {
    // reject unsupported formats before any decoder is created
    if (!SupportedFormats.hasEncoderSupportForImageFormat(format)) {
        throw new RuntimeException("Unknown image format - " + format);
    }
    final ConvertPagesToImages convert = new ConvertPagesToImages(inputDir);
    try {
        if (password != null) {
            convert.setPassword(password);
        }
        if (errorTracker != null) {
            convert.decode_pdf.addExternalHandler(errorTracker, Options.ErrorTracker);
        }
        convert.setup(format, outDir, pageScaling);
        convert.processFiles(inputDir);
    } finally {
        // release the file even when the conversion throws
        convert.closePDFfile();
    }
}
/**
 * Convenience static method to convert a PDF file or a directory of files to a set output size.
 * <p>
 * The converter is always closed before this method returns, including when the
 * conversion fails part way through.
 *
 * @param inputDir directory of files to convert
 * @param outDir directory of output
 * @param format format of images
 * @param dimensions int[]{width, height}
 * @throws org.jpedal.exception.PdfException PdfException
 * @throws RuntimeException if there is no encoder for the requested image format
 *                          or dimensions is not a two element array
 */
public static void writeAllPagesAsImagesToDir(final String inputDir, final String outDir, final String format, final int[] dimensions) throws PdfException {
    // reject unsupported formats before any decoder is created
    if (!SupportedFormats.hasEncoderSupportForImageFormat(format)) {
        throw new RuntimeException("Unknown image format - " + format);
    }
    final ConvertPagesToImages convert = new ConvertPagesToImages(inputDir);
    try {
        convert.setFitToSize(dimensions);
        convert.setup(format, outDir, 1); //note value 1 will be ignored
        convert.processFiles(inputDir);
    } finally {
        // release the file even when the conversion throws
        convert.closePDFfile();
    }
}
/**
 * Convenience static method to convert a PDF file with pageRanges.
 * See class SetOfIntegerSyntax for explanation of the syntax.
 * <p>
 * The converter is always closed before this method returns, including when the
 * page range cannot be parsed or the conversion fails part way through.
 *
 * @param inputDir directory of file to convert
 * @param outDir directory of output
 * @param format format of images
 * @param pageRange the range of pages we want to output
 * @param pageScaling float of the scaling
 * @throws org.jpedal.exception.PdfException PdfException
 * @throws RuntimeException if there is no encoder for the requested image format
 */
public static void writeAllPagesAsImagesToDir(final String inputDir, final String outDir, final String format, final String pageRange, final float pageScaling) throws PdfException {
    // reject unsupported formats before any decoder is created
    if (!SupportedFormats.hasEncoderSupportForImageFormat(format)) {
        throw new RuntimeException("Unknown image format - " + format);
    }
    final ConvertPagesToImages convert = new ConvertPagesToImages(inputDir);
    try {
        convert.setPageRange(new PageRanges(pageRange));
        convert.setup(format, outDir, pageScaling);
        convert.processFiles(inputDir);
    } finally {
        // release the file even when parsing the range or the conversion throws
        convert.closePDFfile();
    }
}
/**
* Alter page scaling (default is 1.33f which gives same size as Acrobat at 100).
* The value is stored and also applied to the decoder straight away.
* <p>
* if setFitToSize(final int[] dimensions) is also set, this value will be ignored
*
* @param pageScaling scaling
*/
public void setPageScaling(final float pageScaling) {
this.pageScaling = pageScaling;
decode_pdf.setExtractionMode(0, pageScaling);
}
/**
 * Request output images that fit inside the given size (respecting aspect ratio).
 * Once set, this takes precedence over any setPageScaling() setting.
 *
 * @param dimensions int[]{width, height} in pixels
 * @throws RuntimeException if dimensions is null or does not hold exactly two values
 */
public void setFitToSize(final int[] dimensions) {
    final boolean isWidthHeightPair = dimensions != null && dimensions.length == 2;
    if (!isWidthHeightPair) {
        throw new RuntimeException("Invalid value for dimensions - expected int[]{width, height}");
    }
    this.dimensions = dimensions;
}
/**
 * Set the pages to convert. A null argument leaves any existing range untouched.
 *
 * @param r attribute to obtain the pages to convert.
 */
public void setPageRange(final PageRanges r) {
    if (r == null) {
        return;
    }
    range = r.getMembers();
}
/**
 * Open the pdf file and, if a page range has been set, expand it into the list of
 * page numbers to convert.
 *
 * @return boolean of if pdf is open
 * @throws RuntimeException if a range starts beyond the last page of the file
 */
@Override
public boolean openPDFFile() throws PdfException, RuntimeException {
    final boolean isOpen = super.openPDFFile();
    final int pageCount = getPageCount();
    if (range == null) {
        pages = null;
        return isOpen;
    }
    final ArrayList<Integer> requestedPages = new ArrayList<>();
    for (final int[] span : range) {
        final int first = span[0];
        int last = span[1];
        // a range starting past the last page cannot be satisfied at all
        if (first > pageCount) {
            throw new RuntimeException("Page range: " + first + '-' + last + " exceeds the number of pages: " + pageCount);
        }
        // a range running past the last page is trimmed with a warning
        if (last > pageCount) {
            System.err.print("Page range end: " + last + " exceeds the number of pages: " + pageCount + '.');
            System.err.println(" Only pages requested that are within the page count have been processed.");
            last = pageCount;
        }
        // expand the span into individual page numbers
        for (int pageNumber = first; pageNumber <= last; pageNumber++) {
            requestedPages.add(pageNumber);
        }
    }
    pages = requestedPages.iterator();
    return isOpen;
}
/**
 * Stores the image format, output directory and scaling used by the static
 * convenience methods, making sure both directory paths end with a separator.
 */
private void setup(final String format, String outDir, final float pageScaling) throws RuntimeException {
    //output dir must end with a separator
    final String normalisedOutDir = outDir.endsWith(separator) ? outDir : outDir + separator;
    imageType = format;
    output_dir = normalisedOutDir;
    this.pageScaling = pageScaling;
    //user dir must end with a separator as well
    if (!user_dir.endsWith(separator)) {
        user_dir = user_dir + separator;
    }
}
/**
* Sets up a ConvertPagesToImages instance to open a PDF File.
* The file name is handed to the superclass constructor and init() is then run.
*
* @param fileName full path to a single PDF file
*/
public ConvertPagesToImages(final String fileName) {
super(fileName);
init();
}
/**
* Sets up a ConvertPagesToImages instance to open a PDF file contained as a BLOB within a byte[] stream
* (do not pad with additional empty bytes).
* The bytes are handed to the superclass constructor and init() is then run.
*
* @param byteArray file's BLOB
*/
public ConvertPagesToImages(final byte[] byteArray) {
super(byteArray);
init();
}
/**
 * Routine to decode a single file and write its pages out as images.
 * <p>
 * The image name prefix is the file name without its path and 4 character extension.
 * Files whose name starts with '.' are skipped. Output goes to output_dir, or to
 * user_dir + "thumbnails" when no output directory has been set.
 * <p>
 * If the system property org.jpedal.separation is set, the pages are rendered with
 * restricted render modes (value "all" produces five variants of every page);
 * otherwise each page is rendered once.
 * <p>
 * The file is always closed before this method returns, including when the
 * extraction fails.
 *
 * @param file_name path of the PDF file to convert
 * @throws PdfException if the file cannot be opened or decoded
 */
@Override
public void decodeFile(final String file_name) throws PdfException {
    String name = "demo"; //set a default just in case
    int pointer = file_name.lastIndexOf(separator);
    if (pointer == -1) {
        pointer = file_name.lastIndexOf('/');
    }
    if (pointer != -1) {
        final String baseName = file_name.substring(pointer + 1);
        //strip the 4 character extension, but never index before the start of a short name
        name = baseName.length() >= 4 ? baseName.substring(0, baseName.length() - 4) : baseName;
    } else if (file_name.toLowerCase().endsWith(".pdf")) {
        name = file_name.substring(0, file_name.length() - 4);
    }
    //fix for odd files on Linux created when you view pages
    if (name.startsWith(".")) {
        return;
    }
    //create output dir for images
    if (output_dir == null) {
        output_dir = user_dir + "thumbnails" + separator;
    }
    //true as we are rendering page
    decode_pdf.setExtractionMode(0, pageScaling);
    //don't bother to extract text and images
    fileName = file_name;
    try {
        if (openPDFFile()) {
            //create a directory if it doesn't exist
            final File output_path = new File(output_dir);
            if (!output_path.exists()) {
                output_path.mkdirs();
            }
            /*
             * allow output to multiple images with different values on each
             *
             * Note we REMOVE shapes as it is a new feature and we do not want to break existing functions
             */
            final String separation = System.getProperty("org.jpedal.separation");
            if (separation != null) {
                //flat list of triples: render mode, file name suffix, transparent background
                Object[] sepValues = {7, "", Boolean.FALSE}; //default of normal
                if ("all".equals(separation)) {
                    sepValues = new Object[]{PdfDecoderServer.RENDERIMAGES, "image_and_shapes", Boolean.FALSE,
                        PdfDecoderServer.RENDERIMAGES + PdfDecoderServer.REMOVE_RENDERSHAPES, "image_without_shapes", Boolean.FALSE,
                        PdfDecoderServer.RENDERTEXT, "text_and_shapes", Boolean.TRUE,
                        7, "all", Boolean.FALSE,
                        PdfDecoderServer.RENDERTEXT + PdfDecoderServer.REMOVE_RENDERSHAPES, "text_without_shapes", Boolean.TRUE
                    };
                }
                final int sepCount = sepValues.length;
                for (int seps = 0; seps < sepCount; seps += 3) {
                    decode_pdf.setRenderMode((Integer) sepValues[seps]);
                    extractPagesAsImages(file_name, output_dir, name + '_' + sepValues[seps + 1], (Boolean) sepValues[seps + 2]); //boolean makes last transparent so we can see white text
                }
            } else {
                //just get the page
                extractPagesAsImages(file_name, output_dir, name, false);
            }
        }
    } finally {
        //always release the file, even when opening or extraction throws
        closePDFfile();
    }
}
/**
 * Converts the pages of the currently open file to images in the output directory.
 * <p>
 * The pages converted are those left in the page range set up by openPDFFile(), or
 * every page of the file when no range is in use. Page numbers in the image names are
 * zero-padded to the width of the highest page number converted, so the files sort
 * in page order. The method can be called repeatedly for the same open file (as
 * decodeFile does for separations) and converts the same pages each time.
 * <p>
 * System properties org.jpedal.multipage_tiff and org.jpedal.compress_tiff (value
 * "true") switch on single file TIFF output and TIFF compression respectively.
 *
 * @param file_name name of the source file, used only in error messages
 * @param output_dir directory the images are written to (created if missing)
 * @param name prefix for the image file names
 * @param isTransparent true to render pages on a transparent background
 * @throws RuntimeException if any page fails to convert; the original exception is the cause
 */
public void extractPagesAsImages(final String file_name, final String output_dir, final String name, final boolean isTransparent) {
    //create a directory if it doesn't exist
    final File output_path = new File(output_dir);
    if (!output_path.exists()) {
        output_path.mkdirs();
    }
    final boolean isSingleOutputFile = "true".equalsIgnoreCase(System.getProperty("org.jpedal.multipage_tiff"));
    final boolean compressTiffs = "true".equalsIgnoreCase(System.getProperty("org.jpedal.compress_tiff"));
    setJPEGCompression();

    //snapshot the pages to convert so the shared iterator is not left exhausted
    final ArrayList<Integer> pagesToConvert = new ArrayList<>();
    if (pages == null) {
        int end = getPageCount();
        //limit pages in testing, never going past the real page count
        if (end > 10 && maxPageCount > 0) {
            end = Math.min(end, maxPageCount);
        }
        for (int page = 1; page <= end; page++) {
            pagesToConvert.add(page);
        }
    } else {
        pages.forEachRemaining(pagesToConvert::add);
        //hand back a fresh iterator so a later call sees the same pages again
        pages = pagesToConvert.iterator();
    }

    //page names are padded to the width of the highest page number
    int highestPage = 0;
    for (final int page : pagesToConvert) {
        highestPage = Math.max(highestPage, page);
    }

    for (final int page : pagesToConvert) {
        try {
            getPage(output_dir, name, isTransparent, isSingleOutputFile, compressTiffs, highestPage, page);
        } catch (final Exception e) {
            throw new RuntimeException("Exception " + e + " with thumbnails on File=" + file_name, e);
        }
    }
}
/**
 * Renders one page and saves it in the configured image format.
 * <p>
 * The page is converted to grayscale when the decoder reports only gray colorspaces.
 * System property maxDimension limits the longest side of the image, and
 * org.jpedal.imageType forces a BufferedImage type. TIFF output goes through
 * saveTiff(); every other format goes through saveImage().
 *
 * @param output_dir directory (ending with a separator) the image is written to
 * @param name prefix for the image file name
 * @param isTransparent true to render on a transparent background
 * @param isSingleOutputFile true to write every page into one TIFF file
 * @param compressTiffs true to compress TIFF output
 * @param end page number whose digit count sets the zero-padding width
 * @param page page number to render (first page is 1)
 * @throws Exception if the page cannot be decoded or the image cannot be written
 */
private void getPage(final String output_dir, final String name, final boolean isTransparent,
        final boolean isSingleOutputFile,
        final boolean compressTiffs, final int end, final int page)
        throws Exception {
    // create a name with zeros for if more than 9 pages appears in correct order
    final StringBuilder pageAsString = new StringBuilder(String.valueOf(page));
    final int padding = String.valueOf(end).length() - pageAsString.length();
    for (int ii = 0; ii < padding; ii++) {
        pageAsString.insert(0, '0');
    }
    final String image_name = isSingleOutputFile ? name : name + "_page_" + pageAsString;

    /*
     * get PRODUCER and if OCR disable text printing
     */
    final PdfFileInformation currentFileInformation = decode_pdf.getFileInformationData();
    final String[] values = currentFileInformation.getFieldValues();
    final String[] fields = PdfFileInformation.getFieldNames();
    for (int i = 0; i < fields.length; i++) {
        if ("Creator".equals(fields[i])) {
            for (final String anOcr : ocr) {
                //constant first so a missing Creator value cannot cause a NullPointerException
                if (anOcr.equals(values[i])) {
                    decode_pdf.setRenderMode(PdfDecoderServer.RENDERIMAGES);
                }
            }
        }
    }

    /*
     * get the current page as a BufferedImage
     */
    BufferedImage image_to_save = getPageAsImage(page, isTransparent);
    //JPEG has no alpha channel so flatten onto white (case-insensitive test, independent of default locale)
    if (isTransparent && image_to_save != null && imageType.regionMatches(true, 0, "jp", 0, 2)) {
        image_to_save = saveJPEGwithoutTransparency(image_to_save);
    }

    /*
     * see what Colorspaces used and reduce image if appropriate
     * (only does Gray at present)
     *
     * null if JPedal unsure
     */
    final Iterator<Integer> colorspacesUsed = decode_pdf.getPageInfo(PageInfo.COLORSPACES);
    boolean isGrayOnly = colorspacesUsed != null; //assume true and disprove
    while (colorspacesUsed != null && colorspacesUsed.hasNext()) {
        final int nextID = colorspacesUsed.next();
        if (nextID != ColorSpaces.DeviceGray && nextID != ColorSpaces.CalGray) {
            isGrayOnly = false;
        }
    }

    //draw onto GRAY image to reduce colour depth
    //(converts ARGB to gray)
    if (isGrayOnly && image_to_save != null) {
        final BufferedImage grayImage = new BufferedImage(image_to_save.getWidth(), image_to_save.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
        final Graphics2D grayGraphics = grayImage.createGraphics();
        try {
            grayGraphics.drawImage(image_to_save, 0, 0, null);
        } finally {
            //release the graphics context once the copy is done
            grayGraphics.dispose();
        }
        image_to_save = grayImage;
    }

    if (image_to_save != null) {
        /*allow user to specify maximum dimension for thumbnail*/
        final String maxDimensionAsString = System.getProperty("maxDimension");
        int maxDimension = -1;
        if (maxDimensionAsString != null) {
            maxDimension = Integer.parseInt(maxDimensionAsString);
        }
        if (maxDimension != -1) {
            image_to_save = resizeImage(image_to_save, maxDimension, imageType);
        }
        final String imageFormat = System.getProperty("org.jpedal.imageType");
        if (imageFormat != null) {
            image_to_save = convertImage(image_to_save, imageFormat);
        }
        //case-insensitive test that does not depend on the default locale (e.g. "TIF" under a Turkish locale)
        if (imageType.regionMatches(true, 0, "tif", 0, 3)) {
            saveTiff(output_dir, isSingleOutputFile, compressTiffs, page, image_name, image_to_save);
        } else {
            saveImage(imageType, false, false,
                    page, image_to_save, output_dir + image_name + '.' + imageType.toLowerCase(java.util.Locale.ROOT));
        }
    }

    //flush images in case we do more than 1 page so only contains
    //images from current page
    decode_pdf.flushObjectValues(true);
}
/**
 * Scales an image down so that neither side is longer than maxDimension, keeping the
 * aspect ratio. Images already within the limit keep their size. The result is always
 * a new image: TYPE_INT_RGB for image types starting with "jp" (JPEG has no alpha
 * channel), TYPE_INT_ARGB for everything else.
 *
 * @param image_to_save image to scale
 * @param maxDimension maximum length in pixels of the longest side
 * @param imageType output image format name, e.g. "png" or "jpg"
 * @return the scaled copy of the image
 */
private static BufferedImage resizeImage(final BufferedImage image_to_save, final int maxDimension, final String imageType) {
    final int width = image_to_save.getWidth();
    final int height = image_to_save.getHeight();
    final Image scaledImage;
    if (width > maxDimension || height > maxDimension) {
        //fix the longer side and let the other follow (-1 keeps the aspect ratio)
        if (width > height) {
            scaledImage = image_to_save.getScaledInstance(maxDimension, -1, BufferedImage.SCALE_SMOOTH);
        } else {
            scaledImage = image_to_save.getScaledInstance(-1, maxDimension, BufferedImage.SCALE_SMOOTH);
        }
    } else {
        scaledImage = image_to_save.getScaledInstance(width, -1, BufferedImage.SCALE_SMOOTH);
    }
    final boolean isJpeg = imageType.regionMatches(true, 0, "jp", 0, 2);
    final int bufferedType = isJpeg ? BufferedImage.TYPE_INT_RGB : BufferedImage.TYPE_INT_ARGB;
    final BufferedImage resized = new BufferedImage(scaledImage.getWidth(null), scaledImage.getHeight(null), bufferedType);
    final Graphics2D g2 = resized.createGraphics();
    try {
        g2.drawImage(scaledImage, 0, 0, null);
    } finally {
        //release the graphics context once the copy is done
        g2.dispose();
    }
    return resized;
}
/**
 * Redraws an image as the BufferedImage type named by imageFormat.
 * <p>
 * imageFormat must be the decimal value of one of the BufferedImage TYPE constants
 * 1 to 13. Anything else (not a number, empty, out of range, or 0 which is TYPE_CUSTOM
 * and cannot be constructed) is reported on System.err and the image is returned unchanged.
 *
 * @param image_to_save image to convert
 * @param imageFormat requested BufferedImage type as a string
 * @return the converted image, or the original image when imageFormat is not usable
 */
private static BufferedImage convertImage(final BufferedImage image_to_save, final String imageFormat) {
    if (!isNumber(imageFormat)) {
        System.err.println("Image Type provided is not an Integer. Value should be a digit between 1 - 13 based on the BufferedImage TYPE variables.");
        return image_to_save;
    }
    int iFormat;
    try {
        iFormat = Integer.parseInt(imageFormat);
    } catch (final NumberFormatException e) {
        //empty value or more digits than an int can hold - treat as out of range
        iFormat = -1;
    }
    //0 is TYPE_CUSTOM, which the BufferedImage constructor rejects
    if (iFormat < 1 || iFormat > 13) {
        System.err.println("Image Type is not valid. Value should be a digit between 1 - 13 based on the BufferedImage TYPE variables.");
        return image_to_save;
    }
    final BufferedImage converted = new BufferedImage(image_to_save.getWidth(), image_to_save.getHeight(), iFormat);
    final Graphics2D g = converted.createGraphics();
    try {
        g.drawImage(image_to_save, 0, 0, null);
    } finally {
        //release the graphics context once the copy is done
        g.dispose();
    }
    return converted;
}
/**
 * Writes an image as a TIFF file named output_dir + image_name + ".tif".
 * <p>
 * Unless system property image.tiff.compression is set, compression is DEFLATE when
 * compressTiffs is true and NONE otherwise. In single file mode the image is appended
 * to the file, which is replaced first when page is 1; otherwise the image is written
 * to its own file.
 *
 * @param output_dir directory (ending with a separator) the file is written to
 * @param isSingleOutputFile true to append every page to one TIFF file
 * @param compressTiffs true to compress the TIFF data
 * @param page page number of the image (first page is 1)
 * @param image_name file name without extension
 * @param image_to_save image to write
 * @throws IOException if the file cannot be written
 */
private static void saveTiff(final String output_dir, final boolean isSingleOutputFile, final boolean compressTiffs, final int page, final String image_name, final BufferedImage image_to_save) throws IOException {
    final TiffEncoder tiffEncoder = new TiffEncoder();
    if (System.getProperty("image.tiff.compression") == null) {
        if (compressTiffs) {
            tiffEncoder.getEncoderOptions().setCompressionFormat(TiffCompressionFormat.DEFLATE);
        } else {
            tiffEncoder.getEncoderOptions().setCompressionFormat(TiffCompressionFormat.NONE);
        }
    }
    final String outputFileName = output_dir + image_name + ".tif";
    final File file = new File(outputFileName);
    if (isSingleOutputFile) {
        // NOTE(review): a stale file is only replaced when page 1 is converted, so a page
        // range that does not start at 1 appends to any existing file - confirm intended
        final boolean isFirstPage = page == 1;
        if (isFirstPage && file.exists()) {
            file.delete();
            file.createNewFile();
        }
        tiffEncoder.append(image_to_save, outputFileName);
    } else {
        //try-with-resources closes (and so flushes) the stream even if the encoder throws
        try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file))) {
            tiffEncoder.write(image_to_save, bos);
        }
    }
}
/**
 * Flattens an image onto a white background so it can be saved as a JPEG,
 * which has no alpha channel.
 *
 * @param image_to_save image that may contain transparent pixels
 * @return a new TYPE_INT_RGB image of the same size with the original drawn over white
 */
private static BufferedImage saveJPEGwithoutTransparency(final BufferedImage image_to_save) {
    final int w = image_to_save.getWidth();
    final int h = image_to_save.getHeight();
    //blank canvas
    final BufferedImage flattened = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
    final Graphics2D g2 = flattened.createGraphics();
    try {
        //white background
        g2.setPaint(Color.WHITE);
        g2.fillRect(0, 0, w, h);
        //paint on image
        g2.drawImage(image_to_save, 0, 0, null);
    } finally {
        //release the graphics context once the copy is done
        g2.dispose();
    }
    return flattened;
}
//////////////////////////////////////////////////////////////////////////
/**
 * Main routine which checks the parameters passed in and runs the conversion.
 * <p>
 * Expects 2-5 values: file, output location, and optionally image type, scaling and
 * page range. With exactly three values the third may be either an image type or a
 * scaling. With any other number of values the usage is printed and nothing is converted.
 *
 * @param args arguments
 * @throws RuntimeException if the conversion fails; the PdfException is kept as the cause
 */
@SuppressWarnings("unused")
public static void main(final String[] args) {
    System.out.println("Simple demo to extract images from a page");
    //check values first and exit with info if too many
    final int count = args.length;
    if (count < 2 || count > 5) {
        System.out.println("wrong arguments entered");
        System.out.println("2-5 values expected - 1. file \n 2. output location for images\n 3. output image type (png, tiff, jpeg) (optional)\n 4. Scaling (optional) \n 5. Page Range (optional)");
        final StringBuilder arguments = new StringBuilder();
        for (final String arg : args) {
            arguments.append(arg).append('\n');
        }
        System.out.println("you entered:\n" + arguments + "as the arguments");
        return;
    }
    try {
        switch (count) {
            case 2:
                writeAllPagesAsImagesToDir(args[0], args[1], "png", 1.33f);
                break;
            case 4:
                writeAllPagesAsImagesToDir(args[0], args[1], args[2], Float.parseFloat(args[3]));
                break;
            case 5:
                writeAllPagesAsImagesToDir(args[0], args[1], args[2], args[4], Float.parseFloat(args[3]));
                break;
            default:
                //three values: the optional one is either the image type or the scaling
                final String s = args[2].toLowerCase();
                if (SupportedFormats.hasEncoderSupportForImageFormat(s)) {
                    writeAllPagesAsImagesToDir(args[0], args[1], s, 1.33f);
                } else if (StringUtils.isNumber(s)) {
                    writeAllPagesAsImagesToDir(args[0], args[1], "png", Float.parseFloat(s));
                } else {
                    System.out.println("Optional value provided does not match expected type.\n Value should be either output image type (png, tiff, jpeg) or Scaling (scaling as a float e.g. 1.0)");
                }
                break;
        }
    } catch (final PdfException ex) {
        //keep the original exception as the cause so its stack trace is not lost
        throw new RuntimeException(ex.getMessage(), ex);
    }
}
/**
 * Test to see if a string is a number, i.e. one or more of the digits 0-9 and nothing else.
 * The empty string is not a number.
 *
 * @param value text to test
 * @return true if value is non-empty and contains only the characters '0' to '9'
 */
private static boolean isNumber(final String value) {
    //no digits at all cannot be parsed as a number
    if (value.isEmpty()) {
        return false;
    }
    final int charCount = value.length();
    for (int i = 0; i < charCount; i++) {
        final char c = value.charAt(i);
        if (c < '0' || c > '9') {
            return false;
        }
    }
    return true;
}
/**
* Shared set-up run by both constructors: registers the font replacements, marks the
* extraction type as a rasterized page, runs the superclass init() and then applies
* the current page scaling to the decoder.
*/
@Override
void init() {
//mappings for non-embedded fonts to use
FontMappings.setFontReplacements();
type = ExtractTypes.RASTERIZED_PAGE;
// NOTE(review): type is set before super.init() - presumably the superclass reads it there; confirm
super.init();
decode_pdf.setExtractionMode(0, pageScaling);
}
/**
 * Renders a single page. When a fit-to-size has been set, the scaling is first
 * recalculated for this page.
 *
 * @param page Logical page number in PDF (first page is 1)
 * @param isBackgroundTransparent defines if BufferedImage has a white or transparent background
 * @return BufferedImage of PDF page
 * @throws PdfException is any issues decoding PDF file
 */
public BufferedImage getPageAsImage(final int page, final boolean isBackgroundTransparent) throws PdfException {
    checkFileOpened();
    if (dimensions != null) {
        final float fitScaling = getScalingForPage(page);
        decode_pdf.setExtractionMode(0, fitScaling);
    }
    if (isBackgroundTransparent) {
        //use this if you want a transparent image
        return decode_pdf.getPageAsTransparentImage(page);
    }
    return decode_pdf.getPageAsImage(page);
}
/**
 * Works out the scaling that makes the page's crop box fit inside the requested
 * dimensions. For pages rotated by 90 or 270 degrees the requested width and height
 * are swapped before comparing against the crop box.
 *
 * @param pageIndex page number the scaling is calculated for
 * @return the smaller of the horizontal and vertical scaling factors
 */
private float getScalingForPage(final int pageIndex) {
    final PdfPageData pageData = decode_pdf.getPdfPageData();
    final boolean isSideways = pageData.getRotation(pageIndex) == 90 || pageData.getRotation(pageIndex) == 270;
    final float targetWidth = isSideways ? dimensions[1] : dimensions[0];
    final float targetHeight = isSideways ? dimensions[0] : dimensions[1];
    final float cropWidth = pageData.getCropBoxWidth2D(pageIndex);
    final float cropHeight = pageData.getCropBoxHeight2D(pageIndex);
    final float widthScale = targetWidth / cropWidth;
    final float heightScale = targetHeight / cropHeight;
    //choose smaller scaling factor to fit
    return heightScale < widthScale ? heightScale : widthScale;
}
/**
* Renders a single page on a white background; shorthand for
* getPageAsImage(page, false).
*
* @param page Logical page number in PDF (first page is 1)
* @return BufferedImage of PDF page with white background
* @throws PdfException is any issues decoding PDF file
*/
public BufferedImage getPageAsImage(final int page) throws PdfException {
return getPageAsImage(page, false);
}
/**
 * Get the page numbers that will be converted: the page range iterator when a range
 * has been set, otherwise every page from 1 to the page count.
 *
 * @return Iterator of page numbers
 */
public Iterator<Integer> getPageRange() {
    if (range == null) {
        return IntStream.range(1, getPageCount() + 1).iterator();
    }
    return pages;
}
}
|