/*
* Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)
*/
package org.jpedal.examples.text;
import org.jpedal.examples.text.output.HTMLStructuredTextWriter;
import org.jpedal.examples.text.output.XMLStructuredTextWriter;
import org.jpedal.examples.text.output.StructuredTextWriter;
import org.jpedal.examples.text.configuration.ExtractStructuredTextProperties;
import org.jpedal.examples.text.configuration.OutputModes;
import org.jpedal.exception.PdfException;
import org.jpedal.external.ErrorTracker;
import org.jpedal.external.Options;
import org.jpedal.utils.LogWriter;
import org.w3c.dom.Document;
/**
* <h2>Extract Structured Content (if present) from PDF files</h2>
* <br>
* This class provides a simple Java API to extract Structured Content (if present) from a PDF file. It also provides a static convenience method if you just want to dump any structured outlines from a PDF file or a directory containing PDF files.<br>
* If no Structure is present a blank file is returned<br>
* <br>For non-structured files, consider:
* <ul>
* <li><a href="http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextAsWordlist.java.html">ExtractTextAsWordList</a></li>
* <li><a href="http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextInRectangle.java.html">ExtractTextInRectangle</a></li>
* </ul>
* <br><a href="https://support.idrsolutions.com/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction</a><br>
*/
public class ExtractStructuredText extends BaseTextExtraction {

    /**
     * Extraction settings. Starts as a default instance and is only replaced
     * when a constructor is given a non-null properties object.
     */
    ExtractStructuredTextProperties properties = new ExtractStructuredTextProperties();

    /**
     * Sets up an ExtractStructuredText instance to open a PDF File
     *
     * @param fileName full path to a single PDF file
     */
    public ExtractStructuredText(final String fileName) {
        super(fileName);
        init();
    }

    /**
     * Sets up an ExtractStructuredText instance to open a PDF file contained as a BLOB within a byte[] stream
     *
     * @param byteArray Array which will hold BLOB
     */
    public ExtractStructuredText(final byte[] byteArray) {
        super(byteArray);
        init();
    }

    /**
     * Sets up an ExtractStructuredText instance to open a PDF File
     *
     * @param fileName   full path to a single PDF file
     * @param properties ExtractStructuredTextProperties object for configuring
     *                   extraction; if null the default properties are kept
     */
    public ExtractStructuredText(final String fileName, final ExtractStructuredTextProperties properties) {
        super(fileName);
        if (properties != null) {
            this.properties = properties;
        }
        init();
    }

    /**
     * Sets up an ExtractStructuredText instance to open a PDF file contained as
     * a BLOB within a byte[] stream
     *
     * @param byteArray  Array which will hold BLOB
     * @param properties ExtractStructuredTextProperties object for configuring
     *                   extraction; if null the default properties are kept
     */
    public ExtractStructuredText(final byte[] byteArray, final ExtractStructuredTextProperties properties) {
        super(byteArray);
        if (properties != null) {
            this.properties = properties;
        }
        init();
    }

    /**
     * Opens the named PDF and, if it yields a structured content Document,
     * writes that Document out in the configured output mode (XML or HTML).
     * Nothing is written if the file cannot be opened, no Document is
     * returned, or the output mode is neither XML nor HTML.
     *
     * @param file_name name of the PDF file to decode
     * @throws PdfException declared by the overridden method
     */
    @Override
    void decodeFile(final String file_name) throws PdfException {

        fileName = file_name;

        if (!openPDFFile()) {
            return;
        }

        //read pages -if you already have code this is probably
        //all you need!
        final Document tree = getStructuredTextContent();
        if (tree == null) {
            return;
        }

        //choose the writer for the configured mode, then write once
        final StructuredTextWriter writer;
        switch (properties.getFileOutputMode()) {
            case XML:
                writer = new XMLStructuredTextWriter(separator);
                break;
            case HTML:
                writer = new HTMLStructuredTextWriter(separator);
                break;
            default:
                writer = null;
                break;
        }

        if (writer != null) {
            writer.write(tree, file_name, output_dir);
        }
    }

    /**
     * This class will allow you to extract any Structured Text data via command line from a single PDF file or a directory of PDF files.
     * <br>
     * The example expects two or three parameters:
     * <ul>
     * <li>Value 1 is the file name or directory of PDF files to process</li>
     * <li>Value 2 is directory to write out the outline data</li>
     * <li>(Optional) Value 3 is the output format, "xml" or "html" (case-insensitive)</li>
     * </ul>
     *
     * @param args The expected arguments are described above.
     * @throws IllegalArgumentException if a third argument is given that is not a recognised output format
     */
    @SuppressWarnings("unused")
    public static void main(final String[] args) {

        switch (args.length) {
            case 0:
            case 1:
                System.out.println("Example takes 2 parameters");
                System.out.println("Value 1 is the file name or directory of PDF files to process");
                System.out.println("Value 2 is Directory for writing the data as text files");
                System.out.println("(Optional) Value 3 is the output format, defaults to xml if not present");
                System.exit(0);
                //explicit break so control can never fall into the extraction cases
                break;

            case 2:
                try {
                    writeAllStructuredTextOutlinesToDir(args[0], args[1]);
                } catch (final PdfException e) {
                    LogWriter.writeLog(e);
                }
                break;

            case 3: {
                final ExtractStructuredTextProperties properties = new ExtractStructuredTextProperties();
                properties.setFileOutputMode(parseOutputMode(args[2]));
                try {
                    // NOTE(review): an empty password is passed here whereas the two-argument
                    // path passes null - confirm whether the difference is intentional
                    writeAllStructuredTextOutlinesToDir(args[0], "", args[1], null, properties);
                } catch (final PdfException e) {
                    LogWriter.writeLog(e);
                }
                break;
            }

            default:
                System.out.println("too many arguments entered - run with no values to see defaults");
                final StringBuilder arguments = new StringBuilder();
                for (final String arg : args) {
                    arguments.append(arg).append('\n');
                }
                System.out.println("you entered:\n" + arguments + "as the arguments");
                System.exit(0);
                break;
        }
    }

    /**
     * Maps the command line output format value onto an OutputModes constant.
     *
     * @param outputFormat value supplied on the command line, matched case-insensitively
     * @return OutputModes.HTML for "html", OutputModes.XML for "xml"
     * @throws IllegalArgumentException if the value is neither "html" nor "xml"
     */
    private static OutputModes parseOutputMode(final String outputFormat) {

        if ("html".equalsIgnoreCase(outputFormat)) {
            return OutputModes.HTML;
        }
        if ("xml".equalsIgnoreCase(outputFormat)) {
            return OutputModes.XML;
        }

        final StringBuilder message = new StringBuilder();
        message.append("Output format of \"").append(outputFormat).append("\" is not recognised.\n");
        message.append("Valid values are as follows: ");

        //comma separate the modes so they do not run together in the message
        String delimiter = "";
        for (final OutputModes mode : OutputModes.values()) {
            message.append(delimiter).append(mode);
            delimiter = ", ";
        }

        throw new IllegalArgumentException(message.toString());
    }

    /**
     * Marks this instance as a STRUCTURED_TEXT extraction before running the
     * superclass initialisation.
     */
    @Override
    void init() {
        type = BaseTextExtraction.ExtractTypes.STRUCTURED_TEXT;
        super.init();
    }

    /**
     * Convenience method to write any Structured text in a directory of PDF files
     *
     * @param inputDir  PDF file or directory containing PDF files
     * @param password  user or owner password for pdf file (may be null)
     * @param outputDir directory for writing out the structured text files
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir) throws PdfException {
        writeAllStructuredTextOutlinesToDir(inputDir, password, outputDir, null, null);
    }

    /**
     * Convenience method to write any Structured text in a directory of PDF files
     *
     * @param inputDir     PDF file or directory containing PDF files
     * @param password     user or owner password for pdf file (may be null)
     * @param outputDir    directory for writing out the structured text files
     * @param errorTracker a custom error tracker (may be null)
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir, final ErrorTracker errorTracker) throws PdfException {
        writeAllStructuredTextOutlinesToDir(inputDir, password, outputDir, errorTracker, null);
    }

    /**
     * Convenience method to write any Structured text in a directory of PDF files
     *
     * @param inputDir     PDF file or directory containing PDF files
     * @param password     user or owner password for pdf file (may be null)
     * @param outputDir    directory for writing out the structured text files
     * @param errorTracker a custom error tracker (may be null)
     * @param properties   a ExtractStructuredTextProperties object for configuration
     *                     (may be null, in which case the defaults are used)
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir, final ErrorTracker errorTracker, final ExtractStructuredTextProperties properties) throws PdfException {

        //the two argument constructor keeps the default properties when given null
        final ExtractStructuredText extract = new ExtractStructuredText(inputDir, properties);

        try {
            if (password != null) {
                extract.setPassword(password);
            }

            if (errorTracker != null) {
                //register the tracker supplied by the caller
                extract.decode_pdf.addExternalHandler(errorTracker, Options.ErrorTracker);
            }

            extract.setup(outputDir);
            extract.processFiles(inputDir);
        } finally {
            //always release the PDF, even if processing fails part way through
            extract.closePDFfile();
        }
    }

    /**
     * Convenience method to write any Structured text in a directory of PDF files
     *
     * @param inputDir  PDF file or directory containing PDF files
     * @param outputDir directory for writing out the structured text files
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String outputDir) throws PdfException {
        writeAllStructuredTextOutlinesToDir(inputDir, null, outputDir, null, null);
    }

    /**
     * Stores the output directory, appending the separator if it is missing.
     *
     * @param outputDir directory the output files are written to
     */
    private void setup(String outputDir) {

        //check output dir has separator
        if (!outputDir.endsWith(separator)) {
            outputDir += separator;
        }

        output_dir = outputDir;
    }

    /**
     * gets the Document containing any Structured text (if present) as a Document structure
     * <br>
     * If the Document does not contain the meta data for Structured Content, an empty Document is returned
     *
     * @return Document
     */
    public Document getStructuredTextContent() {
        return decode_pdf.getMarkedContent();
    }
}
|