/*
* Copyright (c) 1997-2025 IDRsolutions (https://www.idrsolutions.com)
*/
package org.jpedal.examples.images;
import com.idrsolutions.image.JDeli;
import com.idrsolutions.image.utility.SupportedFormats;
import org.jpedal.exception.PdfException;
import org.jpedal.external.ErrorTracker;
import org.jpedal.external.Options;
import org.jpedal.io.ColorSpaceConvertor;
import org.jpedal.utils.LogWriter;
import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
/**
* <h2>Clipped Image Extraction from PDF files</h2>
* <br>
* This class provides a simple Java API to extract clipped images from a PDF file and also
* a static convenience method if you just want to dump all the images from a PDF file
* or directory containing PDF files at a set of sizes<br>
*
* <br><a href="https://support.idrsolutions.com/jpedal/tutorials/extract-images/extract-clipped-images-from-pdf">See our support pages for more information on extracting images.</a>
*/
public class ExtractClippedImages extends BaseImageExtraction {
/**
* target heights (in pixels) for the size-based outputs - a value of 0 or less (such as -1)
* means unchanged, and images are never enlarged beyond their original height
*/
private float[] outputSizes;
/**
* target directories for the size-based outputs (same order as outputSizes)
*/
private String[] outputSizeDirectories;
/**
* page scaling values to decode at (read from values prefixed with "x") - when any sizes
* are configured a final entry of 1 is appended, which is the pass used to generate the
* size-based outputs
*/
private float[] outputScales;
/**
* target directories for the scaled outputs (same order as outputScales) - a null entry
* marks the pass which generates the size-based outputs instead
*/
private String[] outputScaleDirectories;
/**
* background colour to add to JPEG (which has no transparency)
*/
private static final Color backgroundColor = Color.WHITE;
/**
* Sets up an ExtractClippedImages instance to open a PDF File
* (writeAllClippedImagesToDirs also passes its input directory through this constructor)
*
* @param fileName full path to a single PDF file
*/
public ExtractClippedImages(final String fileName) {
super(fileName);
//selects clipped image extraction and then runs the base class initialisation
init();
}
/**
* Sets up an ExtractClippedImages instance to open a PDF file contained as a BLOB within a byte[] stream
*
* @param byteArray pdf file data
*/
public ExtractClippedImages(final byte[] byteArray) {
super(byteArray);
//selects clipped image extraction and then runs the base class initialisation
init();
}
/**
 * Fetches the clipped version of a single image, identified by its position on the page.
 * Working through the images one page at a time is recommended as it is quicker.
 *
 * @param page logical page number (1 is first page)
 * @param imageNumber index of the image on the page (0 is first image)
 * @return the clipped image as a BufferedImage
 * @throws PdfException PdfException
 */
public BufferedImage getClippedImage(final int page, final int imageNumber) throws PdfException {
    //translate the position on the page into the name the image was stored under
    final String storedName = getImageName(page, imageNumber);
    return getClippedImage(page, storedName);
}
/**
 * Fetches the clipped version of a single image, identified by its stored name.
 * The page is selected first and the image is then loaded from the decoder's object
 * store under the key "CLIP_" followed by the image name.
 *
 * @param page logical page number (1 is first page)
 * @param image_name name of image
 * @return the clipped image (callers in this class check the result for null)
 * @throws PdfException PdfException
 */
private BufferedImage getClippedImage(final int page, final String image_name) throws PdfException {
    selectPage(page);
    final String clippedKey = "CLIP_" + image_name;
    return decode_pdf.getObjectStore().loadStoredImage(clippedKey);
}
/**
 * Convenience method to Extract all the images in a directory of PDF files
 *
 * @param inputDir directory of input files
 * @param password password to open PDF files (null if none is needed)
 * @param outDir directory of output files
 * @param imageType 3 letter value for image format to be used
 * @param subDirs alternating value/directory pairs: each value is either a target height
 *                or a scaling prefixed with "x", and is followed by the sub directory of
 *                outDir to write that version into
 * @param errorTracker a custom error tracker (null if none is wanted)
 * @throws org.jpedal.exception.PdfException PdfException
 * @throws RuntimeException if there is no encoder support for imageType
 */
public static void writeAllClippedImagesToDirs(final String inputDir, final String password, final String outDir,
                                               final String imageType, final String[] subDirs, final ErrorTracker errorTracker) throws PdfException {
    if (!SupportedFormats.hasEncoderSupportForImageFormat(imageType)) {
        throw new RuntimeException("Unknown image format - " + imageType);
    }

    final ExtractClippedImages extract = new ExtractClippedImages(inputDir);
    try {
        if (password != null) {
            extract.setPassword(password);
        }
        if (errorTracker != null) {
            //register the tracker supplied by the caller (not the extraction instance itself)
            extract.decode_pdf.addExternalHandler(errorTracker, Options.ErrorTracker);
        }
        extract.setup(outDir, imageType, subDirs);
        extract.processFiles(inputDir);
    } finally {
        //always release the PDF, even if setup or processing failed
        extract.closePDFfile();
    }
}
/**
 * Convenience method to Extract all the images in a directory of PDF files.
 * Behaves as the version taking an error tracker, with no tracker registered.
 *
 * @param inputDir directory of input files
 * @param password password to open PDF files
 * @param outDir directory of output files
 * @param imageType 3 letter value for image format to be used
 * @param subDirs sub directory of files
 * @throws org.jpedal.exception.PdfException PdfException
 */
public static void writeAllClippedImagesToDirs(final String inputDir, final String password, final String outDir, final String imageType, final String[] subDirs) throws PdfException {
    final ErrorTracker noErrorTracker = null;
    writeAllClippedImagesToDirs(inputDir, password, outDir, imageType, subDirs, noErrorTracker);
}
/**
 * Convenience method to Extract all the images in a directory of PDF files.
 * No password is set and no error tracker is registered.
 *
 * @param inputDir directory of input files
 * @param outDir directory of output files
 * @param imageType 3 letter value for image format to be used
 * @param subDirs sub directory of files
 * @throws org.jpedal.exception.PdfException PdfException
 */
public static void writeAllClippedImagesToDirs(final String inputDir, final String outDir, final String imageType, final String[] subDirs) throws PdfException {
    final String noPassword = null;
    final ErrorTracker noErrorTracker = null;
    writeAllClippedImagesToDirs(inputDir, noPassword, outDir, imageType, subDirs, noErrorTracker);
}
/**
 * Stores the image type and turns the value/directory pairs into the size and scale
 * settings used while decoding, creating each output directory if it does not exist.
 *
 * @param outDir root output directory (a separator is appended if missing)
 * @param imageType image format to write
 * @param subDirs alternating value/directory pairs - a value starting with "x" is a
 *                scaling, any other value is a size; an unpaired last entry is ignored
 */
private void setup(String outDir, final String imageType, final String[] subDirs) {
    //root output directory must end with a separator
    if (!outDir.endsWith(separator)) {
        outDir += separator;
    }
    this.imageType = imageType;

    final ArrayList<Float> sizeValues = new ArrayList<>();
    final ArrayList<String> sizeTargets = new ArrayList<>();
    final ArrayList<Float> scaleValues = new ArrayList<>();
    final ArrayList<String> scaleTargets = new ArrayList<>();

    //walk the pairs: the value sits at the even index, its directory at the odd index after it
    for (int valueIndex = 0; valueIndex + 1 < subDirs.length; valueIndex += 2) {
        String target = outDir + subDirs[valueIndex + 1];
        final boolean hasTrailingSlash = target.endsWith("\\") || target.endsWith("/");
        if (!hasTrailingSlash) {
            target += separator;
        }

        final File targetDir = new File(target);
        if (!targetDir.exists()) {
            targetDir.mkdirs();
        }

        final String value = subDirs[valueIndex];
        if (value.startsWith("x")) {
            scaleValues.add(Float.parseFloat(value.substring(1)));
            scaleTargets.add(target);
        } else {
            sizeValues.add(Float.parseFloat(value));
            sizeTargets.add(target);
        }
    }

    //copy the sizes into the arrays used while decoding
    final int sizeCount = sizeValues.size();
    outputSizes = new float[sizeCount];
    for (int n = 0; n < sizeCount; n++) {
        outputSizes[n] = sizeValues.get(n);
    }
    outputSizeDirectories = sizeTargets.toArray(new String[0]);

    //copy the scales, leaving one extra slot when sizes are needed as well
    final int scaleCount = scaleValues.size();
    final boolean sizesRequested = sizeCount > 0;
    final int passCount = sizesRequested ? scaleCount + 1 : scaleCount;
    outputScales = new float[passCount];
    outputScaleDirectories = new String[passCount];
    for (int n = 0; n < scaleCount; n++) {
        outputScales[n] = scaleValues.get(n);
        outputScaleDirectories[n] = scaleTargets.get(n);
    }

    //extra pass at a scaling of 1 with no directory, used to generate the sized versions
    if (sizesRequested) {
        outputScales[scaleCount] = 1;
        outputScaleDirectories[scaleCount] = null;
    }
}
/**
 * routine to decodeFile a PDF file
 * <p>
 * Every page is decoded once for each configured scaling. The location and clipped
 * version of each image on the page are collected first and then written out - into the
 * directory for that scaling, or as the set of sized versions when the scaling has no
 * directory. Exceptions thrown while extracting are logged rather than rethrown, and the
 * PDF file is always closed before returning.
 *
 * @param file_name name of the PDF file (used for the output names and recorded in the XML)
 * @throws PdfException PdfException
 */
@Override
public void decodeFile(final String file_name) throws PdfException {
    try {
        if (!openPDFFile()) {
            return;
        }

        //page range
        final int start = 1;
        final int end = getPageCount();

        try {
            for (int page = start; page <= end; page++) { //read pages
                for (int scaleIndex = 0; scaleIndex < outputScales.length; scaleIndex++) {
                    LogWriter.writeLog("Decoding Page " + page);
                    decode_pdf.getPdfPageData().setScalingValue(outputScales[scaleIndex]);

                    //image count (note image 1 is item 0, so any loop runs 0 to count-1)
                    final int image_count = getImageCount(page);

                    //tell user
                    if (image_count > 0) {
                        LogWriter.writeLog("page " + page + " contains " + image_count + " images");
                    } else {
                        LogWriter.writeLog("No bitmapped images on page " + page);
                    }
                    LogWriter.writeLog("Writing out " + image_count + " images");

                    //location of images
                    final float[] x1 = new float[image_count];
                    final float[] y1 = new float[image_count];
                    final float[] w = new float[image_count];
                    final float[] h = new float[image_count];
                    final String[] image_name = new String[image_count];
                    final BufferedImage[] image = new BufferedImage[image_count];

                    //work through and get each image details
                    for (int i = 0; i < image_count; i++) {
                        image_name[i] = getImageName(page, i);
                        //we need some duplicates as we update some values on merge but still need originals at end
                        //so easiest just to store
                        x1[i] = pdf_images.getImageXCoord(i);
                        y1[i] = pdf_images.getImageYCoord(i);
                        w[i] = pdf_images.getImageWidth(i);
                        h[i] = pdf_images.getImageHeight(i);
                        image[i] = getClippedImage(page, image_name[i]);
                    }

                    //save each image
                    for (int i = 0; i < image_count; i++) {
                        if (image[i] == null) {
                            continue;
                        }
                        final String entry = "<PAGELOCATION x1=\"" + x1[i] + "\" "
                                + "y1=\"" + (y1[i] + h[i]) + "\" "
                                + "x2=\"" + (x1[i] + w[i]) + "\" "
                                + "y2=\"" + (y1[i]) + "\" />\n";
                        if (outputScaleDirectories[scaleIndex] == null) {
                            //no directory for this scaling, so this is the pass for the sized versions
                            generateVersionsForSizes(file_name, page, entry, image[i], i, outputSizes.length);
                        } else {
                            generateVersion(file_name, page, entry, image[i], i, outputScaleDirectories[scaleIndex], outputScales[scaleIndex], true);
                        }
                    }

                    //flush images in case we do more than 1 page so only contains
                    //images from current page
                    decode_pdf.flushObjectValues(true);
                }
            }
        } catch (final Exception e) {
            LogWriter.error(e, "Exception thrown when extracting clipped images from " + file_name);
        }
    } finally {
        //single close on every path (opened or not, with or without a failure)
        decode_pdf.closePdfFile();
    }
}
/**
 * Writes one version of the image for each of the first outputCount configured sizes,
 * each into the directory paired with that size.
 *
 * @param file_name name of the PDF file the image came from
 * @param page logical page number
 * @param s XML entry describing where the image sits on the page
 * @param bufferedImage image to write
 * @param i index of the image on the page
 * @param outputCount number of configured sizes to generate
 */
private void generateVersionsForSizes(final String file_name, final int page, final String s, final BufferedImage bufferedImage, final int i, final int outputCount) {
    int version = 0;
    while (version < outputCount) {
        final String targetDirectory = outputSizeDirectories[version];
        final float targetHeight = outputSizes[version];
        //false = treat the value as a target height rather than a scaling
        generateVersion(file_name, page, s, bufferedImage, i, targetDirectory, targetHeight, false);
        version++;
    }
}
/**
 * Writes one version of an image, plus an XML file describing it, into the given directory.
 * <p>
 * The output name is the PDF file name (without directory or extension) followed by the
 * page number and image index. When scale is false the image is shrunk to the requested
 * height (it is never enlarged, and a size of 0 or less leaves it unchanged); when scale is
 * true the image is written as it is and size is only recorded as the scaling used. JPEG
 * output is drawn onto an opaque background as it has no transparency. Failures are logged
 * rather than thrown.
 *
 * @param file_name name of the PDF file the image came from
 * @param page logical page number
 * @param s XML entry describing where the image sits on the page (written as supplied)
 * @param bufferedImage image to write (nothing is written if null)
 * @param i index of the image on the page
 * @param directory directory to write into (expected to end with a separator)
 * @param size scaling value if scale is true, otherwise the target height
 * @param scale true if size is a scaling value, false if it is a target height
 */
private void generateVersion(final String file_name, final int page, final String s, final BufferedImage bufferedImage, final int i, final String directory, final float size, final boolean scale) {
    if (bufferedImage == null) {
        return;
    }

    try {
        BufferedImage image_to_save = bufferedImage;

        //name starts after the last separator of either kind and stops before the extension
        final int separatorIndex = Math.max(file_name.lastIndexOf('\\'), file_name.lastIndexOf('/'));
        final int nameStart = separatorIndex + 1;
        int nameEnd = file_name.lastIndexOf('.');
        if (nameEnd <= nameStart) {
            //no extension in the name itself (any dot found belongs to a directory)
            nameEnd = file_name.length();
        }
        final String nameToUse = file_name.substring(nameStart, nameEnd);
        final String outputName = directory + nameToUse + '_' + page + '_' + i;

        final int originalHeight = image_to_save.getHeight();
        float scaling = 1;
        if (scale) {
            scaling = size;
        } else if (size > 0) {
            //only ever shrink
            scaling = Math.min(1, size / originalHeight);
        }

        if (!scale && scaling != 1) {
            //guard against a dimension truncating to zero, which BufferedImage rejects
            final int targetHeight = Math.max(1, (int) size);
            final int targetWidth = Math.max(1, (int) (image_to_save.getWidth() * scaling));
            final Image scaledImage = image_to_save.getScaledInstance(-1, targetHeight, Image.SCALE_SMOOTH);
            final BufferedImage resized = new BufferedImage(targetWidth, targetHeight, BufferedImage.TYPE_INT_ARGB);
            final Graphics2D g2 = resized.createGraphics();
            try {
                g2.drawImage(scaledImage, 0, 0, null);
            } finally {
                g2.dispose();
            }
            image_to_save = resized;
        }

        //no transparency on JPEG so give background and draw on
        if (imageType.startsWith("jp") && !"jp2".equalsIgnoreCase(imageType)) {
            final int iw = image_to_save.getWidth();
            final int ih = image_to_save.getHeight();
            final BufferedImage background = new BufferedImage(iw, ih, BufferedImage.TYPE_INT_RGB);
            final Graphics2D g2 = background.createGraphics();
            try {
                g2.setPaint(backgroundColor);
                g2.fillRect(0, 0, iw, ih);
                g2.drawImage(image_to_save, 0, 0, null);
            } finally {
                g2.dispose();
            }
            image_to_save = background;
        }

        if (image_to_save.getType() == BufferedImage.TYPE_CUSTOM) {
            image_to_save = ColorSpaceConvertor.convertToARGB(image_to_save);
        }

        try {
            JDeli.write(image_to_save, imageType, new File(outputName + '.' + imageType));
        } catch (final IOException ex) {
            LogWriter.writeLog("Exception in writing image " + ex);
        }

        //file name goes into element content so the XML special characters must be escaped
        final String xmlFileName = file_name.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");

        //save an xml file with details
        try (OutputStreamWriter output_stream = new OutputStreamWriter(new FileOutputStream(outputName + ".xml"), StandardCharsets.UTF_8)) {
            output_stream.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
            output_stream.write("<!-- Pixel Location of image x1,y1,x2,y2\n");
            output_stream.write("(x1,y1 is top left corner)\n");
            output_stream.write("(origin is bottom left corner) -->\n");
            output_stream.write("\n\n<META>\n");
            output_stream.write(s);
            output_stream.write("<FILE>" + xmlFileName + "</FILE>\n");
            if (scale) {
                //image was decoded at this scaling, so work back to the unscaled height
                output_stream.write("<ORIGINALHEIGHT>" + (int) (image_to_save.getHeight() / size) + "</ORIGINALHEIGHT>\n");
            } else {
                output_stream.write("<ORIGINALHEIGHT>" + originalHeight + "</ORIGINALHEIGHT>\n");
            }
            output_stream.write("<SCALEDHEIGHT>" + image_to_save.getHeight() + "</SCALEDHEIGHT>\n");
            output_stream.write("<SCALING>" + scaling + "</SCALING>\n");
            output_stream.write("</META>\n");
        } catch (final IOException e) {
            LogWriter.error(e, "Exception thrown writing out details for clipped images");
        }
    } catch (final Exception ee) {
        LogWriter.writeLog("Exception " + ee + " in extracting images");
    }
}
/**
 * main routine which checks for any files passed and runs the demo
 * <p>
 * Expected values are the input directory, the output directory and the image type,
 * followed by one or more value/directory pairs.
 *
 * @param args arguments
 */
@SuppressWarnings("unused")
public static void main(final String[] args) {
    //throws if the values are unusable, so the fixed values below are safe to read
    final String[] subDirs = validateInputValues(args);

    final String inputDir = args[0];
    final String outDir = args[1];
    final String imageType = args[2];

    try {
        writeAllClippedImagesToDirs(inputDir, outDir, imageType, subDirs);
    } catch (final PdfException e) {
        throw new RuntimeException(e);
    }
}
/**
 * Checks the command line values and returns the value/directory pairs which follow the
 * three fixed values (input directory, output directory, image type).
 * <p>
 * Each value must be a single number, optionally prefixed with "-", "+" or "x".
 *
 * @param args command line values
 * @return the value/directory pairs (everything from the fourth value onwards)
 * @throws RuntimeException if the number of values is wrong, a value is not a valid number,
 *                          or the input directory does not exist
 */
private static String[] validateInputValues(final String[] args) throws RuntimeException {
    //need the three fixed values plus at least one complete value/directory pair
    final boolean validCount = args.length >= 5 && (args.length % 2) == 1;
    if (!validCount) {
        if (((args.length - 3) % 2) == 1) {
            throw new RuntimeException("Value/Directory pairs invalid");
        }
        System.out.println("Requires");
        System.out.println("inputDir processedDir imageOutputType");
        System.out.println("height Directory (as many pairs as you like)");
        throw new RuntimeException("Not enough parameters passed to software");
    }

    LogWriter.writeLog("Values read");
    LogWriter.writeLog("inputDir=" + args[0]);
    LogWriter.writeLog("outputDir=" + args[1]);
    LogWriter.writeLog("type=" + args[2]);
    LogWriter.writeLog("Directory and height pair values" + args[3] + " <> " + args[4] + '<');

    final String inputDir = args[0]; //rootDir containing files
    final int outputCount = args.length - 3;
    final String[] subDirs = new String[outputCount];
    for (int i = 0; i < outputCount; i++) {
        final String arg = args[i + 3];
        LogWriter.writeLog(arg);
        //even positions hold the value - exactly one number, so that it can be parsed later
        if (((i % 2) == 0) && (!arg.matches("(-|\\+|x)?[0-9]+(\\.[0-9]+)?"))) {
            throw new RuntimeException("Invalid value: " + arg);
        }
        subDirs[i] = arg;
    }

    final File pdf_file = new File(inputDir);
    if (!pdf_file.exists()) {
        throw new RuntimeException("Directory " + inputDir + " not found");
    }
    return subDirs;
}
/**
* selects clipped image extraction and then runs the base class initialisation
* (called from both constructors)
*/
@Override
void init() {
//set before super.init() is called
type = ExtractTypes.CLIPPED_IMAGES;
super.init();
}
/**
 * Decodes the page and reports how many images it holds. The image data for the page is
 * kept by this instance afterwards, for use when looking up image names.
 *
 * @param page logical page number
 * @return int number of images (0 if no images)
 * @throws PdfException if the page could not be decoded (the original exception is the cause)
 */
public int getImageCount(final int page) throws PdfException {
    //flush first - presumably so that only values from this page are held
    decode_pdf.flushObjectValues(true);

    try {
        decode_pdf.decodePage(page);
    } catch (final Exception decodeFailure) { // Cascade up
        throw new PdfException(decodeFailure.getMessage(), decodeFailure);
    }

    //the PdfImages object now holds the image names and other details for the page
    pdf_images = decode_pdf.getPdfImageData();

    //image 1 is item 0, so any loop over the images runs 0 to count-1
    final int imagesOnPage = pdf_images.getImageCount();
    return imagesOnPage;
}
/**
 * Looks up the name of an image on a page, after selecting that page.
 * The name is read from the image data held by this instance.
 *
 * @param page - logical page number
 * @param imageNumber - number of image (0 is first image)
 * @return - String containing image name
 * @throws PdfException PdfException
 */
private String getImageName(final int page, final int imageNumber) throws PdfException {
    selectPage(page);
    final String name = pdf_images.getImageName(imageNumber);
    return name;
}
}
|