/*
* Copyright (c) 1997-2025 IDRsolutions (https://www.idrsolutions.com)
*/
package org.jpedal.examples.text;
import org.jpedal.PdfDecoderServer;
import org.jpedal.examples.text.configuration.ExtractStructuredTextProperties;
import org.jpedal.examples.text.configuration.OutputModes;
import org.jpedal.examples.text.output.EPUBStructuredTextWriter;
import org.jpedal.examples.text.output.HTMLStructuredTextWriter;
import org.jpedal.examples.text.output.JSONStructuredTextWriter;
import org.jpedal.examples.text.output.StructuredTextWriter;
import org.jpedal.examples.text.output.XMLStructuredTextWriter;
import org.jpedal.examples.text.output.YAMLStructuredTextWriter;
import org.jpedal.exception.PdfException;
import org.jpedal.external.ErrorTracker;
import org.jpedal.external.Options;
import org.jpedal.utils.LogWriter;
import org.w3c.dom.Document;
import java.io.IOException;
import java.util.Locale;
/**
* <h2>Extract Structured Content (if present) from PDF files</h2>
* <p>
* This class provides a simple Java API to extract Structured Content (if present) from a PDF file. It also offers
* static convenience methods if you just want to dump any structured outlines from a PDF file or from a directory
* containing PDF files.
* <p>
* If no Structure is present a blank file is returned<br>
* <p>
* For non-structured files, consider:
* <ul>
* <li><a href="http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextAsWordlist.java.html">ExtractTextAsWordList</a></li>
* <li><a href="http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextInRectangle.java.html">ExtractTextInRectangle</a></li>
* </ul>
* <a href="https://support.idrsolutions.com/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction</a>
*/
public class ExtractStructuredText extends BaseTextExtraction {

    /** Extraction settings; replaced by the constructor argument when a non-null one is supplied. */
    ExtractStructuredTextProperties properties = new ExtractStructuredTextProperties();

    /** Directory figures are written to, or {@code null} when figures are not extracted. */
    private String figuresFolder;

    /** Image format name passed on when figures are written. */
    private String figuresFormat;

    /**
     * Sets up an ExtractStructuredText instance to open a PDF File
     *
     * @param fileName full path to a single PDF file
     */
    public ExtractStructuredText(final String fileName) {
        this(fileName, null);
    }

    /**
     * Sets up an ExtractStructuredText instance to open a PDF file contained as a BLOB within a byte[] stream
     *
     * @param byteArray Array which will hold BLOB
     */
    public ExtractStructuredText(final byte[] byteArray) {
        this(byteArray, null);
    }

    /**
     * Sets up an ExtractStructuredText instance to open a PDF File
     *
     * @param fileName full path to a single PDF file
     * @param properties ExtractStructuredTextProperties object for configuring extraction;
     * when {@code null} the default properties are kept
     */
    public ExtractStructuredText(final String fileName, final ExtractStructuredTextProperties properties) {
        super(fileName);
        if (properties != null) {
            this.properties = properties;
        }
        init();
    }

    /**
     * Sets up an ExtractStructuredText instance to open a PDF file contained as a BLOB within a byte[] stream
     *
     * @param byteArray Array which will hold BLOB
     * @param properties ExtractStructuredTextProperties object for configuring extraction;
     * when {@code null} the default properties are kept
     */
    public ExtractStructuredText(final byte[] byteArray, final ExtractStructuredTextProperties properties) {
        super(byteArray);
        if (properties != null) {
            this.properties = properties;
        }
        init();
    }

    /**
     * Opens a PDF file and writes its structured content to the output directory using the
     * output mode held in the properties.
     * <br>
     * Nothing is written if the file cannot be opened or if no structure document is returned.
     * EPUB output is delegated entirely to {@link EPUBStructuredTextWriter}.
     *
     * @param file_name path of the PDF file to decode
     *
     * @throws PdfException if writing EPUB output or extracting figures fails; the original
     * exception is attached as the cause where possible
     */
    @Override
    public void decodeFile(final String file_name) throws PdfException {
        fileName = file_name;
        if (!openPDFFile()) {
            return;
        }

        final OutputModes mode = properties.getFileOutputMode();
        if (mode == OutputModes.EPUB) {
            try {
                EPUBStructuredTextWriter.write(file_name, outputDir, properties, separator, figuresFolder, figuresFormat);
            } catch (final Exception e) {
                throw asPdfException(e);
            }
            return;
        }

        //read pages -if you already have code this is probably
        //all you need!
        final Document tree;
        if (figuresFolder != null) {
            try {
                tree = getStructuredTextContentAndFigures(figuresFolder, figuresFormat);
            } catch (final IOException e) {
                throw asPdfException(e);
            }
        } else {
            tree = getStructuredTextContent();
        }
        if (tree == null) {
            return;
        }

        // All tree-based writers share one write call; only the concrete writer differs.
        final StructuredTextWriter writer = switch (mode) {
            case XML -> new XMLStructuredTextWriter(separator);
            case HTML -> new HTMLStructuredTextWriter(separator);
            case JSON -> new JSONStructuredTextWriter(separator);
            case YAML -> new YAMLStructuredTextWriter(separator);
            default -> null;
        };
        if (writer != null) {
            writer.write(tree, file_name, outputDir);
        }
    }

    /**
     * Wraps an exception in a PdfException carrying the same message, keeping the original
     * exception as the cause so its stack trace is not lost.
     *
     * @param cause the exception to wrap
     *
     * @return a PdfException with the message of {@code cause}
     */
    private static PdfException asPdfException(final Exception cause) {
        final PdfException wrapped = new PdfException(cause.getMessage());
        try {
            wrapped.initCause(cause);
        } catch (final IllegalStateException ignored) {
            // PdfException already fixed its cause on construction; the message alone is kept
        }
        return wrapped;
    }

    /**
     * Command line entry point which extracts any Structured Text data from a single PDF file or a
     * directory of PDF files.
     * <br>
     * The example expects the following parameters:
     * <ul>
     * <li>Value 1 is the file name or directory of PDF files to process</li>
     * <li>Value 2 is the directory to write out the outline data</li>
     * <li>(Optional, unless Value 4 is present then Value 3 must be present) Value 3 is the outline data file format</li>
     * <li>(Optional) Value 4 is the directory to write out the figures data</li>
     * <li>(Optional) Value 5 is the figures output format</li>
     * </ul>
     * With fewer than 2 or more than 5 values a usage message is printed and the JVM exits.
     *
     * @param args The expected arguments are described above.
     *
     * @throws IllegalArgumentException if Value 3 is not a recognised output format
     */
    @SuppressWarnings("unused")
    public static void main(final String[] args) {
        switch (args.length) {
            case 0, 1 -> {
                System.out.println("Example takes 2 parameters");
                System.out.println("Value 1 is the file name or directory of PDF files to process");
                System.out.println("Value 2 is the Directory for writing the outline data as text files");
                System.out.println("(Optional) Value 3 is the outline output format, defaults to xml if not present");
                System.out.println("Value 4 is the Directory for writing the figures data as images");
                System.out.println("(Optional) Value 5 is the figures output format, defaults to jpeg if not present");
                System.exit(0);
            }
            case 2 -> {
                try {
                    writeAllStructuredTextOutlinesToDir(args[0], args[1]);
                } catch (final PdfException e) {
                    LogWriter.error(e, "Exception thrown while extracting structured text");
                }
            }
            case 3 -> {
                try {
                    final ExtractStructuredTextProperties properties = propertiesForFormat(args[2]);
                    writeAllStructuredTextOutlinesToDir(args[0], "", args[1], null, properties);
                } catch (final PdfException e) {
                    LogWriter.error(e, "Exception thrown while extracting structured text");
                }
            }
            case 4, 5 -> {
                try {
                    final ExtractStructuredTextProperties properties = propertiesForFormat(args[2]);
                    final String figuresFormat = args.length < 5 || args[4] == null ? "jpeg" : args[4];
                    writeAllStructuredTextOutlinesAndFiguresToDir(args[0], "", args[1], null, properties, args[3], figuresFormat);
                } catch (final PdfException e) {
                    LogWriter.error(e, "Exception thrown while extracting structured text");
                }
            }
            default -> {
                System.out.println("too many arguments entered - run with no values to see defaults");
                final StringBuilder arguments = new StringBuilder();
                for (final String arg : args) {
                    arguments.append(arg).append('\n');
                }
                System.out.println("you entered:\n" + arguments + "as the arguments");
                System.exit(0);
            }
        }
    }

    /**
     * Builds a fresh properties object whose output mode is set from a format name.
     *
     * @param outputFormat the output format as a string
     *
     * @return new properties configured with the matching output mode
     *
     * @throws IllegalArgumentException if an invalid output mode is supplied
     */
    private static ExtractStructuredTextProperties propertiesForFormat(final String outputFormat) {
        final ExtractStructuredTextProperties properties = new ExtractStructuredTextProperties();
        handleOutputFormat(outputFormat, properties);
        return properties;
    }

    /**
     * Set the file output mode based on a string input
     * <br>
     * Matching ignores case and uses {@link Locale#ROOT} so the result does not depend on the
     * default locale of the JVM.
     *
     * @param outputFormat the output format as a string
     * @param properties the properties object which will have its output mode set
     *
     * @throws IllegalArgumentException if an invalid output mode is supplied
     */
    private static void handleOutputFormat(final String outputFormat, final ExtractStructuredTextProperties properties) throws IllegalArgumentException {
        switch (outputFormat.toLowerCase(Locale.ROOT)) {
            case "html" -> properties.setFileOutputMode(OutputModes.HTML);
            case "xml" -> properties.setFileOutputMode(OutputModes.XML);
            case "json" -> properties.setFileOutputMode(OutputModes.JSON);
            case "epub" -> properties.setFileOutputMode(OutputModes.EPUB);
            case "yaml" -> properties.setFileOutputMode(OutputModes.YAML);
            default -> {
                final StringBuilder message = new StringBuilder();
                message.append("Output format of \"").append(outputFormat).append("\" is not recognised.\n");
                message.append("Valid values are as follows: ");
                String delimiter = "";
                for (final OutputModes mode : OutputModes.values()) {
                    message.append(delimiter).append(mode);
                    delimiter = ", ";
                }
                throw new IllegalArgumentException(message.toString());
            }
        }
    }

    /**
     * Creates the PdfDecoderServer instance used for decoding and runs its static initialisation.
     */
    @Override
    void init() {
        decode_pdf = new PdfDecoderServer(false);
        PdfDecoderServer.init(false);
    }

    /**
     * Convenience method to write any Structured text in a directory of PDF files
     *
     * @param inputDir directory containing PDF files
     * @param password user or owner password for pdf file
     * @param outputDir directory for writing out structured text
     *
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir) throws PdfException {
        writeAllStructuredTextOutlinesToDir(inputDir, password, outputDir, null, null);
    }

    /**
     * Convenience method to write any Structured text in a directory of PDF files
     *
     * @param inputDir directory containing PDF files
     * @param password user or owner password for pdf file
     * @param outputDir directory for writing out structured text
     * @param errorTracker a custom error tracker
     *
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir, final ErrorTracker errorTracker) throws PdfException {
        writeAllStructuredTextOutlinesToDir(inputDir, password, outputDir, errorTracker, null);
    }

    /**
     * Convenience method to write any Structured text in a directory of PDF files
     *
     * @param inputDir directory containing PDF files
     * @param password user or owner password for pdf file
     * @param outputDir directory for writing out structured text
     * @param errorTracker a custom error tracker
     * @param properties a ExtractStructuredTextProperties object for configuration
     *
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String password, final String outputDir, final ErrorTracker errorTracker, final ExtractStructuredTextProperties properties) throws PdfException {
        // Same pipeline as the figures variant, just with no figures directory configured.
        writeAllStructuredTextOutlinesAndFiguresToDir(inputDir, password, outputDir, errorTracker, properties, null, null);
    }

    /**
     * Convenience method to write any Structured text in a directory of PDF files
     * <br>
     * The PDF file is closed again even when processing fails.
     *
     * @param inputDir directory containing PDF files
     * @param password user or owner password for pdf file; ignored when {@code null}
     * @param outputDir directory for writing out structured text; must not be {@code null}
     * @param errorTracker a custom error tracker; ignored when {@code null}
     * @param properties a ExtractStructuredTextProperties object for configuration; defaults are used when {@code null}
     * @param figuresDir directory for writing out figures; no figures are written when {@code null}
     * @param figuresFormat image file format for writing figures
     *
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesAndFiguresToDir(final String inputDir, final String password,
                                                                     final String outputDir, final ErrorTracker errorTracker,
                                                                     final ExtractStructuredTextProperties properties,
                                                                     final String figuresDir, final String figuresFormat) throws PdfException {
        final ExtractStructuredText extract = new ExtractStructuredText(inputDir, properties);

        if (password != null) {
            extract.setPassword(password);
        }
        if (figuresDir != null) {
            extract.figuresFolder = figuresDir;
            extract.figuresFormat = figuresFormat;
        }
        if (errorTracker != null) {
            // register the tracker supplied by the caller (not the extractor itself)
            extract.decode_pdf.addExternalHandler(errorTracker, Options.ErrorTracker);
        }

        extract.setup(outputDir);
        try {
            extract.processFiles(inputDir);
        } finally {
            extract.closePDFfile();
        }
    }

    /**
     * Convenience method to write any Structured text in a directory of PDF files
     *
     * @param inputDir directory containing PDF files
     * @param outputDir directory for writing out structured text
     *
     * @throws org.jpedal.exception.PdfException a PDF exception
     */
    public static void writeAllStructuredTextOutlinesToDir(final String inputDir, final String outputDir) throws PdfException {
        writeAllStructuredTextOutlinesToDir(inputDir, null, outputDir, null, null);
    }

    /**
     * Set up the text extraction.
     * Configure output directory with correct trailing separator.
     *
     * @param outputDir the output directory; must not be {@code null}
     */
    private void setup(String outputDir) {
        //check output dir has separator
        if (!outputDir.endsWith(separator)) {
            outputDir += separator;
        }
        this.outputDir = outputDir;
    }

    /**
     * gets the Document containing any Structured text (if present) as a Document structure
     * <br>
     * If the Document does not contain the meta data for Structured Content, an empty Document is returned
     *
     * @return Document
     */
    public Document getStructuredTextContent() {
        return decode_pdf.getMarkedContent();
    }

    /**
     * gets the Documents containing any Structured text (if present) per page, as an array of Documents
     * <br>
     * If the Document does not contain the meta data for Structured Content, an empty Document is returned
     *
     * @return array of Documents, one per page
     */
    public Document[] getStructuredTextContentPerPage() {
        return decode_pdf.getMarkedContentPerPage();
    }

    /**
     * Gets the marked content from the Document and also writes out the figures to a supplied directory
     *
     * @param figureDir The directory to write the figure images
     * @param imageFormat The format in which to write the figure images
     *
     * @return The marked content document
     *
     * @throws IOException If there is a problem with writing the images
     */
    public Document getStructuredTextContentAndFigures(final String figureDir, final String imageFormat) throws IOException {
        return decode_pdf.getMarkedContent(figureDir, imageFormat);
    }

    /**
     * Gets the marked content from the Document and also writes out the figures to a supplied directory
     *
     * @param figureDir The directory to write the figure images
     * @param imageFormat The format in which to write the figure images
     *
     * @return The marked content document as an array with each page per element
     *
     * @throws IOException If there is a problem with writing the images
     */
    public Document[] getStructuredTextContentAndFiguresPerPage(final String figureDir, final String imageFormat) throws IOException {
        return decode_pdf.getMarkedContentPerPage(figureDir, imageFormat);
    }
}
|