/*
* Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)
*/
package org.jpedal.examples.text;
import org.jpedal.exception.PdfException;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.SecureTransformerFactory;
import org.w3c.dom.Document;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * <h2>Outline Object Data Extraction from PDF files</h2>
 * <br>
 * This class provides a simple Java API to extract the data held in the Outline object (if present) of a PDF file
 * as a {@link Document}, and static convenience methods to dump the outlines of a single PDF file or of every PDF
 * file in a directory.<br>
 * <br><a href="https://support.idrsolutions.com/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction.</a><br>
 */
public class ExtractOutline extends BaseTextExtraction {

    /**
     * Classpath location of the stylesheet applied when the outline is written out as XML.
     */
    private static final String OUTLINE_STYLESHEET = "/org/jpedal/examples/text/xmlstyle.xslt";

    /**
     * Sub-directory name used when the file name contains no path separator.
     */
    private static final String DEFAULT_OUTPUT_NAME = "demo";

    /**
     * Number of trailing characters removed from the file name (the length of ".pdf").
     */
    private static final int EXTENSION_LENGTH = 4;

    /**
     * Sets up an ExtractOutline instance to open a PDF File
     *
     * @param fileName full path to a single PDF file
     */
    public ExtractOutline(final String fileName) {
        super(fileName);
        init();
    }

    /**
     * Sets up an ExtractOutline instance to open a PDF file contained as a BLOB within a byte[] stream
     *
     * @param byteArray pdf file data
     */
    public ExtractOutline(final byte[] byteArray) {
        super(byteArray);
        init();
    }

    /**
     * Opens the given PDF file and, if it has an outline, writes it through the XSLT stylesheet to
     * {@code <output_dir>/<name>/outline.xml}, where {@code <name>} is derived from the file name.
     * Nothing is written if the file cannot be opened or no outline is returned.
     *
     * @param file_name path of the PDF file to decode
     * @throws PdfException if the stylesheet cannot be loaded, the output cannot be written
     *                      or the transformation fails (the original exception is kept as the cause)
     */
    @Override
    void decodeFile(final String file_name) throws PdfException {
        fileName = file_name;

        if (!openPDFFile()) {
            return;
        }

        final Document outline = getPDFTextOutline();

        try (InputStream stylesheet = getClass().getResourceAsStream(OUTLINE_STYLESHEET)) {
            //getResourceAsStream returns null rather than throwing if the resource is missing
            if (stylesheet == null) {
                throw new IOException("Unable to find stylesheet " + OUTLINE_STYLESHEET + " on the classpath");
            }

            final TransformerFactory tFactory = SecureTransformerFactory.newInstance();
            final Transformer transformer = tFactory.newTransformer(new StreamSource(stylesheet));

            final DOMSource source = new DOMSource(outline);
            if (source.getNode() != null) {
                final String outputDir = output_dir + getOutputName(file_name) + separator;

                //make sure the target directory exists (re-check covers a directory created concurrently)
                final File dir = new File(outputDir);
                if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
                    throw new IOException("Unable to create output directory " + outputDir);
                }

                //FileOutputStream creates the file if needed; try-with-resources guarantees it is flushed and closed
                final File output = new File(outputDir + "outline.xml");
                try (FileOutputStream fos = new FileOutputStream(output)) {
                    transformer.transform(source, new StreamResult(fos));
                }
            }
        } catch (final IOException | TransformerException e) {
            throw new PdfException(e.getMessage(), e);
        }
    }

    /**
     * Works out the sub-directory name for a file: the part of the path after the last separator
     * with the last four characters (the ".pdf" extension) removed.
     *
     * @param path path of the PDF file
     * @return name to use as sub-directory, or the default name if the path holds no separator
     */
    private String getOutputName(final String path) {
        final int pointer = path.lastIndexOf(separator);
        if (pointer == -1) {
            return DEFAULT_OUTPUT_NAME; //no path component so use the default just in case
        }

        final int start = pointer + 1;
        final int end = path.length() - EXTENSION_LENGTH;
        if (end >= start) {
            return path.substring(start, end);
        }

        //name is too short to carry an extension - use it whole rather than failing on the substring
        final String remainder = path.substring(start);
        return remainder.isEmpty() ? DEFAULT_OUTPUT_NAME : remainder;
    }

    /**
     * This class will allow you to extract any Outline data via command line from a single PDF file or a directory of PDF files.
     * <br>
     * The example expects two parameters:
     * <ul>
     * <li>Value 1 is the file name or directory of PDF files to process</li>
     * <li>Value 2 is directory to write out the outline data</li>
     * </ul>
     * Run with no parameters to print the usage text.
     *
     * @param args The expected arguments are described above.
     */
    @SuppressWarnings("unused")
    public static void main(final String[] args) {
        final int len = args.length;
        switch (len) {
            case 0:
                System.out.println("Example takes 2 parameters");
                System.out.println("Value 1 is the file name or directory of PDF files to process");
                System.out.println("Value 2 is Directory for writing the data as text files");
                System.exit(0);
                break; //explicit so the case never depends on System.exit to avoid fall-through
            case 2:
                try {
                    writeAllOutlinesToDir(args[0], args[1]);
                } catch (final PdfException e) {
                    LogWriter.writeLog(e);
                }
                break;
            default:
                //reached with exactly one argument or with more than two
                if (len < 2) {
                    System.out.println("too few arguments entered - run with no values to see defaults");
                } else {
                    System.out.println("too many arguments entered - run with no values to see defaults");
                }
                final StringBuilder arguments = new StringBuilder();
                for (final String arg : args) {
                    arguments.append(arg).append('\n');
                }
                System.out.println("you entered:\n" + arguments + "as the arguments");
                System.exit(0);
                break;
        }
    }

    /**
     * Selects outline extraction as the extraction type and then runs the base class initialisation.
     */
    @Override
    void init() {
        type = BaseTextExtraction.ExtractTypes.OUTLINE;
        super.init();
    }

    /**
     * Convenience method to write all the Outlines from a PDF file or a directory of PDF files
     *
     * @param input     file or directory containing PDF files
     * @param outputDir directory for writing out data
     * @throws org.jpedal.exception.PdfException A Pdf Exception
     */
    public static void writeAllOutlinesToDir(final String input, final String outputDir) throws PdfException {
        writeAllOutlinesToDir(input, null, outputDir);
    }

    /**
     * Convenience method to write all the Outlines from a PDF file or a directory of PDF files
     *
     * @param input     file or directory containing PDF files
     * @param password  password to be used to open files (ignored if null)
     * @param outputDir directory for writing out data
     * @throws org.jpedal.exception.PdfException A Pdf Exception
     */
    public static void writeAllOutlinesToDir(final String input, final String password, final String outputDir) throws PdfException {
        final ExtractOutline extract = new ExtractOutline(input);
        try {
            if (password != null) {
                extract.setPassword(password);
            }
            extract.setup(outputDir);
            extract.processFiles(input);
        } finally {
            //always release the PDF, even if processing failed part way through
            extract.closePDFfile();
        }
    }

    /**
     * Stores the output directory, appending the separator if it is missing.
     *
     * @param outputDir directory for writing out data
     */
    private void setup(String outputDir) {
        //check output dir has separator
        if (!outputDir.endsWith(separator)) {
            outputDir += separator;
        }
        output_dir = outputDir;
    }

    /**
     * gets the Document Outline object (if present) as a Document structure
     *
     * @return Document as returned by the decoder (presumably null if the PDF has no outline - callers should check)
     */
    public Document getPDFTextOutline() {
        return decode_pdf.getOutlineAsXML();
    }
}
|