/*
* Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)
*/
package org.jpedal.examples.images;
import com.idrsolutions.image.JDeli;
import com.idrsolutions.image.tiff.TiffEncoder;
import com.idrsolutions.image.tiff.options.TiffCompressionFormat;
import com.idrsolutions.image.utility.SupportedFormats;
import org.jpedal.exception.PdfException;
import org.jpedal.external.ErrorTracker;
import org.jpedal.external.Options;
import org.jpedal.objects.PdfImageData;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.SecureDocumentBuilderFactory;
import org.jpedal.utils.SecureTransformerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
/**
 * <h2>Image Extraction from PDF files</h2>
 * <br>
 * This class provides a simple Java API to extract images from a PDF file and also
 * static convenience methods if you just want to dump all the images from a PDF file
 * or directory containing PDF files.<br>
 * <br>
 * <a href="https://support.idrsolutions.com/jpedal/tutorials/extract-images/extract-images-from-pdf">See our Support Pages for more info on Image Extraction.</a>
 */
public class ExtractImages extends BaseImageExtraction {

    /** If true, images (and metadata) of each page go into a sub-directory named after the page number. */
    private boolean outputPagesInSeparateDirs = true;

    /** Output directory requested by the caller; when null a directory is derived from the PDF name. */
    private String defaultOutputDir;

    /** If true, an XML metadata file is written for every image processed. */
    private boolean writeOutMetadata = true;

    /**
     * Sets up an ExtractImages instance to open a PDF File
     *
     * @param fileName full path to a single PDF file
     */
    public ExtractImages(final String fileName) {
        super(fileName);
        init();
    }

    /**
     * Sets up an ExtractImages instance to open a PDF file contained as a BLOB within a byte[] stream
     *
     * @param byteArray pdf file data
     */
    public ExtractImages(final byte[] byteArray) {
        super(byteArray);
        init();
    }

    /**
     * Convenience method to Extract all the images in a directory of PDF files
     *
     * @param inputDir             directory containing PDF files
     * @param password             password used to open PDF files
     * @param outputDir            directory for writing out images
     * @param imageType            3 letter value for image format to be used
     * @param generateMetaData     if true include additional XML file with metadata on image
     * @param outputPagesInSepDirs if true place images from each page in separate sub-directory
     * @throws org.jpedal.exception.PdfException if problem with processing PDF files
     */
    public static void writeAllImagesToDir(final String inputDir, final String password, final String outputDir, final String imageType, final boolean generateMetaData, final boolean outputPagesInSepDirs) throws PdfException {
        writeAllImagesToDir(inputDir, password, outputDir, imageType, generateMetaData, outputPagesInSepDirs, null);
    }

    /**
     * Convenience method to Extract all the images in a directory of PDF files
     *
     * @param inputDir             directory containing PDF files
     * @param outputDir            directory for writing out images
     * @param imageType            3 letter value for image format to be used
     * @param generateMetaData     if true include additional XML file with metadata on image
     * @param outputPagesInSepDirs if true place images from each page in separate sub-directory
     * @throws org.jpedal.exception.PdfException if problem with processing PDF files
     */
    public static void writeAllImagesToDir(final String inputDir, final String outputDir, final String imageType, final boolean generateMetaData, final boolean outputPagesInSepDirs) throws PdfException {
        writeAllImagesToDir(inputDir, null, outputDir, imageType, generateMetaData, outputPagesInSepDirs, null);
    }

    /**
     * Convenience method to Extract all the images in a directory of PDF files
     *
     * @param inputDir             directory containing PDF files
     * @param password             password used to open PDF files (ignored if null)
     * @param outputDir            directory for writing out images
     * @param imageType            3 letter value for image format to be used
     * @param generateMetaData     if true include additional XML file with metadata on image
     * @param outputPagesInSepDirs if true place images from each page in separate sub-directory
     * @param errorTracker         a custom error tracker (ignored if null)
     * @throws org.jpedal.exception.PdfException if problem with processing PDF files
     * @throws IllegalArgumentException          if there is no encoder support for imageType
     */
    public static void writeAllImagesToDir(final String inputDir, final String password, final String outputDir,
                                           final String imageType, final boolean generateMetaData, final boolean outputPagesInSepDirs, final ErrorTracker errorTracker) throws PdfException {
        if (!SupportedFormats.hasEncoderSupportForImageFormat(imageType)) {
            throw new IllegalArgumentException("Unknown image format - " + imageType);
        }

        final ExtractImages extract = new ExtractImages(inputDir);
        try {
            if (password != null) {
                extract.setPassword(password);
            }
            if (errorTracker != null) {
                // register the tracker supplied by the caller (not the extractor itself)
                extract.decode_pdf.addExternalHandler(errorTracker, Options.ErrorTracker);
            }
            extract.setup(outputDir, imageType, generateMetaData, outputPagesInSepDirs);
            extract.processFiles(inputDir);
        } finally {
            // always release the PDF, even if processing failed part way through
            extract.closePDFfile();
        }
    }

    /**
     * Opens the PDF and writes out the images of every page (1 to page count) into the output directory.
     *
     * @param fileName not used by this implementation
     * @throws PdfException if an image on any page cannot be written (original exception is the cause)
     */
    @Override
    void decodeFile(final String fileName) throws PdfException {
        if (!openPDFFile()) {
            return;
        }

        final int pageCount = getPageCount();

        // use the caller's directory if one was set, otherwise derive one from the PDF name
        output_dir = defaultOutputDir == null
                ? user_dir + "images" + separator + name + separator
                : defaultOutputDir;
        ensureDirectoryExists(output_dir);

        for (int page = 1; page <= pageCount; page++) {
            //image count (note image 1 is item 0, so any loop runs 0 to count-1)
            final int imageCount = getImageCount(page);

            // only create the target directory for pages which actually contain images
            if (imageCount > 0) {
                ensureDirectoryExists(resolveOutputDir(page));
            }

            try {
                writeImagesFromPage(imageCount, page);
            } catch (final Exception ex) { // Cascade up
                throw new PdfException(ex.getMessage(), ex);
            }

            // Flush images in case we do more than 1 page so only contains
            // images from current page
            decode_pdf.flushObjectValues(true);
        }
    }

    /**
     * Returns the directory used for files of the given page: a page-numbered sub-directory of
     * output_dir when pages are separated, otherwise output_dir itself.
     */
    private String resolveOutputDir(final int page) {
        return outputPagesInSeparateDirs ? output_dir + page + separator : output_dir;
    }

    /**
     * Creates the directory (and any missing parents) if it does not exist yet.
     */
    private static void ensureDirectoryExists(final String path) {
        final File dir = new File(path);
        if (!dir.exists()) {
            dir.mkdirs();
        }
    }

    /**
     * Saves the raw and processed version of every image on the page plus, if enabled, its XML metadata.
     *
     * @param image_count number of images on the page
     * @param page        logical page number
     * @throws Exception if an image cannot be loaded or written
     */
    private void writeImagesFromPage(final int image_count, final int page) throws Exception {
        final String outputDir = resolveOutputDir(page);

        // Work through and save each image
        for (int i = 0; i < image_count; i++) {
            final String imageName = getImageName(page, i);

            //raw version of image (file name is prefixed with R); skipped if not available
            final BufferedImage rawImage = getImage(page, imageName, false);
            if (rawImage != null) {
                saveImage(rawImage, outputDir + 'R' + imageName + '_' + page + '.' + imageType, imageType);
            }

            //processed version of image; skipped if not available
            final BufferedImage displayedImage = getImage(page, imageName, true);
            if (displayedImage != null) {
                saveImage(displayedImage, outputDir + imageName + '_' + page + '.' + imageType, imageType);
            }

            //save metadata as XML file
            if (writeOutMetadata) {
                outputMetaDataToXML(fileName, page, pdf_images, i, imageName);
            }
        }
    }

    /**
     * Marks this extraction as an image extraction and then runs the base class initialisation.
     */
    @Override
    void init() {
        type = ExtractTypes.IMAGES;
        super.init();
    }

    /**
     * save image - different versions have different bugs for file formats so we use best for
     * each image type
     *
     * @param image_to_save extracted image
     * @param fileName      full path of the file to write
     * @param prefix        image format; any value containing "tif" is written with the TiffEncoder
     *                      (DEFLATE compressed if the System property org.jpedal.compress_tiff is set,
     *                      uncompressed otherwise), everything else is written with JDeli
     * @throws Exception if the image cannot be encoded or the file cannot be written
     */
    private static void saveImage(final BufferedImage image_to_save, final String fileName, final String prefix) throws Exception {
        if (prefix.contains("tif")) {
            //get tiff compression (presence of the property is enough, its value is not read)
            final boolean compressTiffs = System.getProperty("org.jpedal.compress_tiff") != null;

            final TiffEncoder tiffEncoder = new TiffEncoder();
            tiffEncoder.getEncoderOptions().setCompressionFormat(
                    compressTiffs ? TiffCompressionFormat.DEFLATE : TiffCompressionFormat.NONE);

            // try-with-resources so the stream is closed even if encoding fails
            try (FileOutputStream os = new FileOutputStream(fileName)) {
                tiffEncoder.write(image_to_save, os);
                os.flush();
            }
        } else { //other images
            JDeli.write(image_to_save, prefix, new File(fileName));
        }
    }

    /**
     * write out details of image to XML file (image name + ".xml" in the directory used for the page).
     * Any failure is logged and does not stop the extraction.
     *
     * @param file_name  name of the PDF file the image was extracted from
     * @param page       logical page number
     * @param pdf_images image data for the page
     * @param i          index of the image within pdf_images
     * @param image_name name of the image
     */
    private void outputMetaDataToXML(final String file_name, final int page, final PdfImageData pdf_images, final int i, final String image_name) {
        final float x1 = pdf_images.getImageXCoord(i);
        final float y1 = pdf_images.getImageYCoord(i);
        final float w = pdf_images.getImageWidth(i);
        final float h = pdf_images.getImageHeight(i);

        try {
            //create doc and set root
            final SecureDocumentBuilderFactory dbf = new SecureDocumentBuilderFactory();
            final DocumentBuilder db = dbf.newDocumentBuilder();
            final Document doc = db.newDocument();
            final Node root = doc.createElement("meta");
            doc.appendChild(root);

            //add comments
            final Node creation = doc.createComment("Created " + org.jpedal.utils.TimeNow.getShortTimeNow());
            doc.appendChild(creation);
            final Node info = doc.createComment("Pixel Location of image x1,y1,x2,y2");
            doc.appendChild(info);
            final Node moreInfo = doc.createComment("x1,y1 is top left corner origin is bottom left corner");
            doc.appendChild(moreInfo);

            //add location
            final Element location = doc.createElement("PAGELOCATION");
            location.setAttribute("x1", String.valueOf(x1));
            location.setAttribute("y1", String.valueOf((y1 + h)));
            location.setAttribute("x2", String.valueOf((x1 + w)));
            location.setAttribute("y2", String.valueOf(y1));
            root.appendChild(location);

            //add pdf file extracted from
            final Element fileElement = doc.createElement("FILE");
            fileElement.setAttribute("value", file_name);
            root.appendChild(fileElement);

            //build transformer from the bundled stylesheet
            final Transformer transformer;
            try (InputStream stylesheet = getClass().getResourceAsStream("/org/jpedal/examples/text/xmlstyle.xslt")) {
                final TransformerFactory transformerFactory = SecureTransformerFactory.newInstance();
                transformer = transformerFactory.newTransformer(new StreamSource(stylesheet));
            }

            //write out via an explicit stream: the target is a plain file path, not a URI,
            //and the stream is closed whatever happens
            try (FileOutputStream out = new FileOutputStream(resolveOutputDir(page) + image_name + ".xml")) {
                transformer.transform(new DOMSource(doc), new StreamResult(out));
            }
        } catch (final Exception e) {
            LogWriter.writeLog(e);
        }
    }

    //////////////////////////////////////////////////////////////////////////

    /**
     * This class will allow you to extract Images via command line from a single PDF file or a directory of PDF files.
     * <br>
     * The example expects exactly three parameters (usage is printed otherwise):
     * <ul>
     * <li>Value 1 is the file name or directory of PDF files to process</li>
     * <li>Value 2 is directory to write out the images</li>
     * <li>Value 3 is image type (jpeg,tiff,png)</li>
     * </ul>
     * Metadata XML files are generated and all pages are written into the same directory.
     *
     * @param args The expected arguments are described above.
     */
    @SuppressWarnings("unused")
    public static void main(final String[] args) {
        //check user has passed us a filename, output location and image type
        final int len = args.length;
        if (len != 3) {
            System.out.println("Class takes 3 parameters: ");
            System.out.println("Value 1 is the file name or directory of PDF files to process");
            System.out.println("Value 2 is Directory for writing the images");
            System.out.println("Value 3 is image type (jpeg,tiff,png).");

            if (len > 3) {
                System.out.println("\nToo many arguments entered");
                final StringBuilder arguments = new StringBuilder();
                for (final String arg : args) {
                    arguments.append(arg).append('\n');
                }
                System.out.println("You entered:\n" + arguments);
            }
        } else {
            try {
                writeAllImagesToDir(args[0], args[1], args[2], true, false);
            } catch (final PdfException e) {
                LogWriter.writeLog(e);
            }
        }
    }

    /**
     * extract any image from any page - recommended you process images on each page in turn as quicker
     *
     * @param page             logical page number (1 is first page)
     * @param imageNumber      image on page (0 is first image)
     * @param imageAsDisplayed if true return image as displayed (with scaling/rotation) otherwise use raw stored image (often but not always the same). Neither is clipped
     * @return BufferedImage
     * @throws PdfException if problem with extracting image from PDF file
     */
    public BufferedImage getImage(final int page, final int imageNumber, final boolean imageAsDisplayed) throws PdfException {
        checkFileOpened();
        return getImage(page, getImageName(page, imageNumber), imageAsDisplayed);
    }

    /**
     * extract any image from any page - recommended you process images on each page in turn as quicker
     *
     * @param page             logical page number (1 is first page)
     * @param image_name       name of image
     * @param imageAsDisplayed if true return image as displayed (with scaling/rotation) otherwise use raw stored image (often but not always the same). Neither is clipped
     * @return BufferedImage loaded from the object store (raw images are stored under the name prefixed with R)
     * @throws PdfException if problem with extracting image from PDF file
     */
    private BufferedImage getImage(final int page, final String image_name, final boolean imageAsDisplayed) throws PdfException {
        selectPage(page);

        final String storedName = imageAsDisplayed ? image_name : 'R' + image_name;
        return decode_pdf.getObjectStore().loadStoredImage(storedName);
    }

    /**
     * Stores the output settings used by the static convenience methods.
     *
     * @param outputDir            directory for writing out images (a trailing separator is added if missing);
     *                             if null the default location is kept
     * @param imageType            image format to be used
     * @param generateMetaData     if true include additional XML file with metadata on image
     * @param outputPagesInSepDirs if true place images from each page in separate sub-directory
     */
    private void setup(String outputDir, final String imageType, final boolean generateMetaData, final boolean outputPagesInSepDirs) {
        this.imageType = imageType;

        if (outputDir != null) {
            //check output dir has separator
            defaultOutputDir = outputDir.endsWith(separator) ? outputDir : outputDir + separator;
        }

        writeOutMetadata = generateMetaData;
        outputPagesInSeparateDirs = outputPagesInSepDirs;
    }

    /**
     * returns an image count for the selected page
     *
     * @param page logical page number
     * @return int number of images (0 if no images)
     * @throws PdfException if problem with opening PDF file
     */
    public int getImageCount(final int page) throws PdfException {
        checkFileOpened();
        selectPage(page);
        //image count (note image 1 is item 0, so any loop runs 0 to count-1)
        return pdf_images.getImageCount();
    }

    /**
     * returns the image data object for the selected page
     *
     * @param page logical page number
     * @return PdfImageData held by this instance after the page has been selected
     * @throws PdfException if problem with opening PDF file
     */
    public PdfImageData getImageData(final int page) throws PdfException {
        checkFileOpened();
        selectPage(page);
        return pdf_images;
    }

    /**
     * Return name of image (composite of filename and Internal PDF image name)
     *
     * @param page        - logical page number
     * @param imageNumber - number of image (0 is first image)
     * @return - String containing image name
     * @throws PdfException if problem with extracting image from PDF file
     */
    private String getImageName(final int page, final int imageNumber) throws PdfException {
        checkFileOpened();
        selectPage(page);
        return pdf_images.getImageName(imageNumber);
    }
}
|