/*
* Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)
*/
package org.jpedal.examples.text;
import org.jpedal.exception.PdfException;
import org.jpedal.external.ErrorTracker;
import org.jpedal.external.Options;
import org.jpedal.utils.LogWriter;
import java.awt.Rectangle;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
/**
* <h2>Extract text from PDF files</h2>
* <br>
* This class provides a simple Java API to extract text from a PDF file,
* and also static convenience methods if you just want to dump all the text
* from a PDF file or a directory containing PDF files.<br>
* <br><a href="https://support.idrsolutions.com/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction.</a><br>
*/
public class ExtractTextInRectangle extends BaseTextExtraction {
/**
* The available formats that text can be output as.
* <ul>
* <li>XML - setOutputFormat switches the decoder to XML extraction and
* per-page files are written with an ".xml" extension and a metadata header</li>
* <li>TXT - setOutputFormat switches the decoder to text extraction and
* per-page files are written with a ".txt" extension (the default)</li>
* </ul>
*/
public enum OUTPUT_FORMAT {
XML, TXT
}
/** The format to output the text as; starts as TXT and is updated by setOutputFormat */
private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TXT;
/**
* Whether to attempt to detect paragraphs and insert newlines into the output text.
* Not explicitly initialised, so it starts as false.
*/
private boolean estimateParagraphs;
/**
* Sets up an ExtractTextInRectangle instance to open a PDF File.
* <p>
* Delegates to the superclass constructor and then calls init(), which
* selects plain text (TXT) as the output format.
*
* @param fileName full path to a single PDF file
*/
public ExtractTextInRectangle(final String fileName) {
super(fileName);
init();
}
/**
 * Sets up an ExtractTextInRectangle instance to open a PDF File.
 *
 * @param fileName         full path to a single PDF file
 * @param extractPlainText {@code true} to extract plain text, {@code false} to extract XML
 */
public ExtractTextInRectangle(final String fileName, final boolean extractPlainText) {
    super(fileName, extractPlainText);
    init();
    // init() always selects TXT output, which would silently discard a request for
    // XML; re-apply the caller's choice once initialisation has finished.
    if (!extractPlainText) {
        setOutputFormat(OUTPUT_FORMAT.XML);
    }
}
/**
* Sets up an ExtractTextInRectangle instance to open a PDF file contained as a BLOB within a byte[] stream.
* <p>
* Delegates to the superclass constructor and then calls init(), which
* selects plain text (TXT) as the output format.
*
* @param byteArray pdf file data
*/
public ExtractTextInRectangle(final byte[] byteArray) {
super(byteArray);
init();
}
/**
 * Selects the format used for extracted text.
 * <p>
 * Switches the underlying decoder between XML and plain text extraction and
 * records the choice, which also determines the extension (and, for XML, the
 * metadata header) of the per-page files written by this class.
 *
 * @param format the format to use; must not be {@code null}
 * @throws NullPointerException if {@code format} is {@code null}
 */
public void setOutputFormat(final OUTPUT_FORMAT format) {
    if (format == null) {
        // Switching on null would raise a message-less NullPointerException;
        // keep the exception type but tell the caller what went wrong.
        throw new NullPointerException("format must not be null");
    }

    if (format == OUTPUT_FORMAT.XML) {
        decode_pdf.useXMLExtraction();
    } else {
        decode_pdf.useTextExtraction();
    }

    outputFormat = format;
}
/**
* Sets whether extraction should attempt to detect paragraphs.
* <p>
* The stored flag is passed to the extraction call made by
* getTextOnPage(page, x1, y1, x2, y2) and is recorded in the metadata
* header of XML output files.
*
* @param estimateParagraphs true to estimate paragraphs, false otherwise
*/
public void setEstimateParagraphs(final boolean estimateParagraphs) {
this.estimateParagraphs = estimateParagraphs;
}
/**
 * Decodes a single PDF file and writes the text of each page to its own file.
 * <p>
 * Output is written into a sub-directory of the configured output directory. The
 * sub-directory is named after the PDF (file name without path and extension), or
 * "demo" when the supplied name contains no path separator. Pages are processed
 * from page 1 up to the page count, or up to {@code maxCount} when a positive
 * limit has been configured. Nothing is written if the file cannot be opened.
 *
 * @param file_name path of the PDF file to decode
 * @throws PdfException if the file cannot be opened/decoded or a page cannot be processed
 */
@Override
void decodeFile(final String file_name) throws PdfException {
    fileName = file_name;

    if (!openPDFFile()) {
        return;
    }

    String name = "demo"; //set a default just in case
    final int pointer = file_name.lastIndexOf(separator);
    if (pointer != -1) {
        // Strip the extension by locating the last dot rather than assuming a
        // 4 character ".pdf" suffix, which failed for short or extension-less names.
        final String baseName = file_name.substring(pointer + 1);
        final int dot = baseName.lastIndexOf('.');
        name = dot > 0 ? baseName.substring(0, dot) : baseName;
    }

    final String outputDir = output_dir + name + separator;

    //page range
    final int firstPage = 1;
    int lastPage = getPageCount();

    // Honour the page limit for every document; it used to be ignored for
    // documents with ten pages or fewer.
    if (maxCount > 0 && lastPage > maxCount) {
        lastPage = maxCount;
    }

    try {
        for (int page = firstPage; page <= lastPage; page++) { //read pages
            decodePage(page, outputDir);
        }
    } catch (final Exception e) {
        // callers only expect PdfException; keep the original failure as the cause
        throw new PdfException(e.getMessage(), e);
    }
}
/**
 * Extracts the text inside the media box of one page and writes it to a file named
 * after the page number (".txt" or ".xml" extension, depending on the output format)
 * inside {@code outputDir}.
 * <p>
 * For XML output the text is preceded by a metadata header recording the page
 * rectangle, the paragraph-estimation flag and the source file name, and the file is
 * written as UTF-8 to match the encoding declared in the XML prolog. Plain text is
 * written using the platform encoding ("file.encoding"). Failures while writing the
 * file are logged and not propagated. Page data held by the decoder is flushed afterwards.
 *
 * @param page      page number (first page is 1)
 * @param outputDir directory to write into; expected to end with the separator
 * @throws PdfException if the text cannot be extracted from the page
 * @throws IOException  if the output directory cannot be created
 */
private void decodePage(final int page, final String outputDir) throws PdfException, IOException {
    selectPage(page);

    /*Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */
    final int x1 = currentPageData.getMediaBoxX(page);
    final int x2 = currentPageData.getMediaBoxWidth(page) + x1;
    final int y2 = currentPageData.getMediaBoxY(page);
    final int y1 = currentPageData.getMediaBoxHeight(page) + y2;

    final String text = getTextOnPage(page, x1, y1, x2, y2);

    if (text != null) {
        //ensure a directory for data
        final File pagePath = new File(outputDir + separator);
        if (!pagePath.exists() && !pagePath.mkdirs()) {
            throw new IOException("Unable to create output directory - " + pagePath.getAbsolutePath());
        }

        final boolean isXml = outputFormat == OUTPUT_FORMAT.XML;
        final String extension = isXml ? ".xml" : ".txt";
        // The XML prolog below declares UTF-8, so the bytes must really be UTF-8;
        // plain text keeps using the platform encoding as before.
        final String encoding = isXml ? "UTF-8" : System.getProperty("file.encoding");

        try (OutputStreamWriter out = new OutputStreamWriter(
                new FileOutputStream(outputDir + page + extension), encoding)) {
            if (isXml) {
                out.write("<?xml version=\"1.1\" encoding=\"UTF-8\"?>\n");
                out.write("<meta>\n");
                out.write(" <PAGELOCATION x1=\"" + x1 + "\" "
                        + "y1=\"" + y1 + "\" "
                        + "x2=\"" + x2 + "\" "
                        + "y2=\"" + y2 + "\" />\n");
                out.write(" <ESTIMATEPARAGRAPHS value=\"" + estimateParagraphs + "\"/>\n");
                // file names may contain characters such as & or " that are illegal in an attribute
                out.write(" <FILE value=\"" + escapeXmlAttribute(decode_pdf.getFileName()) + "\"/>\n");
                out.write("</meta>\n");
                out.write("<TEXT>\n");
                //NOTE DATA IS TECHNICALLY UNICODE
                out.write(text); //write actual data
                out.write("\n</TEXT>\n");
            } else {
                out.write(text); //write actual data
            }
        } catch (final IOException e) {
            // writing is best effort: record the problem and carry on with the next page
            LogWriter.writeLog(e);
        }
    }

    //remove data once written out
    decode_pdf.flushObjectValues(false);
}

/**
 * Escapes the characters that may not appear verbatim inside a quoted XML attribute value.
 *
 * @param value raw attribute value; {@code null} is rendered as the text "null"
 * @return the value with &amp;, &lt;, &gt;, double and single quotes replaced by entities
 */
private static String escapeXmlAttribute(final String value) {
    final String raw = String.valueOf(value);
    final StringBuilder escaped = new StringBuilder(raw.length() + 16);
    for (int i = 0; i < raw.length(); i++) {
        final char c = raw.charAt(i);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            case '\'':
                escaped.append("&apos;");
                break;
            default:
                escaped.append(c);
        }
    }
    return escaped.toString();
}
/**
 * Extracts all text on the page (the whole media box) as a string value.
 * <p>
 * The paragraph-estimation setting of this instance (see
 * {@link #setEstimateParagraphs(boolean)}) is applied, in the same way as for
 * {@link #getTextOnPage(int, int, int, int, int)}.
 * <p>
 * If the page contains text with multiple orientations (Left to right,
 * bottom to top), only the most common orientation will be extracted and
 * others will be ignored
 *
 * @param page number (first page is 1)
 * @return String with text
 * @throws PdfException if problem with parsing and extracting text from PDF file
 */
public String getTextOnPage(final int page) throws PdfException {
    checkFileOpened();
    selectPage(page);

    /*Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */
    final int left = currentPageData.getMediaBoxX(page);
    final int right = currentPageData.getMediaBoxWidth(page) + left;
    final int bottom = currentPageData.getMediaBoxY(page);
    final int top = currentPageData.getMediaBoxHeight(page) + bottom;

    // pass the configured flag instead of a hard-coded false so that
    // setEstimateParagraphs affects every getTextOnPage overload
    return currentGrouping.extractTextInRectangle(left, top, right, bottom, page, estimateParagraphs, true);
}
/**
 * Extracts all text on the page in a specified region as a string value.
 * <p>
 * The region passed to the extraction call is
 * (rectangle.x, rectangle.y) to (rectangle.x + width, rectangle.y + height).
 * The paragraph-estimation setting of this instance (see
 * {@link #setEstimateParagraphs(boolean)}) is applied, in the same way as for
 * {@link #getTextOnPage(int, int, int, int, int)}.
 * <p>
 * If the page contains text with multiple orientations (Left to right, bottom to
 * top), only the most common orientation will be extracted and others will
 * be ignored
 *
 * @param page      (first page is 1)
 * @param rectangle the region of the page to extract text from; must not be {@code null}
 * @return String with text
 * @throws PdfException if problem with parsing and extracting text from PDF file
 */
public String getTextOnPage(final int page, final Rectangle rectangle) throws PdfException {
    checkFileOpened();
    selectPage(page);

    final int x1 = rectangle.x;
    final int y1 = rectangle.y;
    final int x2 = rectangle.x + rectangle.width;
    final int y2 = rectangle.y + rectangle.height;

    // pass the configured flag instead of a hard-coded false so that
    // setEstimateParagraphs affects every getTextOnPage overload
    return currentGrouping.extractTextInRectangle(x1, y1, x2, y2, page, estimateParagraphs, true);
}
/**
* Extracts all text on the page in a specified region as a string value.
* <p>
* Co-ordinates are x1,y1 (top left hand corner) and x2,y2 (bottom right hand corner).
* The estimateParagraphs setting of this instance is passed on to the extraction call.
* <p>
* If the page contains text with multiple orientations (Left to right, bottom to
* top), only the most common orientation will be extracted and others will
* be ignored
*
* @param page (first page is 1)
* @param x1 - top left corner x
* @param y1 - top left corner y
* @param x2 - bottom right corner x
* @param y2 - bottom right corner y
* @return String with text
* @throws PdfException if problem with parsing and extracting text from PDF file
*/
public String getTextOnPage(final int page, final int x1, final int y1, final int x2, final int y2) throws PdfException {
checkFileOpened();
selectPage(page);
return currentGrouping.extractTextInRectangle(x1, y1, x2, y2, page, estimateParagraphs, true);
}
/**
 * Command line entry point that extracts all text from a single PDF file or a
 * directory of PDF files.
 * <br>
 * The example expects two parameters:
 * <ul>
 * <li>Value 1 is the file name or directory of PDF files to process</li>
 * <li>Value 2 is directory to write out the data</li>
 * </ul>
 * Running with no parameters prints this usage information. Any other number of
 * parameters is reported back to the user and nothing is processed.
 *
 * @param args The expected arguments are described above.
 */
@SuppressWarnings("unused")
public static void main(final String[] args) {
    switch (args.length) {
        case 0:
            System.out.println("Example takes 2 parameters");
            System.out.println("Value 1 is the file name or directory of PDF files to process");
            System.out.println("Value 2 is Directory for writing the data as text files");
            System.exit(0);
            // explicit break: do not rely on System.exit never returning to avoid fall-through
            break;
        case 2:
            try {
                writeAllTextToDir(args[0], args[1], -1);
            } catch (final PdfException e) {
                LogWriter.writeLog(e);
            }
            break;
        default:
            // a single argument also ends up here, so say which way the count is wrong
            final String problem = args.length < 2 ? "too few" : "too many";
            System.out.println(problem + " arguments entered - run with no values to see defaults");
            final StringBuilder arguments = new StringBuilder();
            for (final String arg : args) {
                arguments.append(arg).append('\n');
            }
            System.out.println("you entered:\n" + arguments + "as the arguments");
            System.exit(0);
    }
}
/**
* Marks this extractor as the TEXT_IN_RECTANGLE type, runs the superclass
* initialisation and then selects plain text (TXT) as the output format.
* <p>
* Called from each constructor of this class.
*/
@Override
void init() {
type = BaseTextExtraction.ExtractTypes.TEXT_IN_RECTANGLE;
super.init();
setOutputFormat(OUTPUT_FORMAT.TXT);
}
/**
* Convenience method to write all the text in a directory of PDF files.
* <p>
* Delegates to the overload taking an ErrorTracker, passing null so that no
* custom error tracker is registered.
*
* @param inputDir directory containing PDF files
* @param password user or owner password for PDF files
* @param outputDir directory for writing out the extracted text files
* @param maxPages limit to just the first maxPages of a document
* @param format set the output format for the text content (TXT or XML)
* @param estimateParagraphs set if JPedal should estimate paragraph spacing in output.
* @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
*/
public static void writeAllTextToDir(final String inputDir, final String password, final String outputDir,
final int maxPages, final OUTPUT_FORMAT format, final boolean estimateParagraphs) throws PdfException {
writeAllTextToDir(inputDir, password, outputDir, maxPages, format, estimateParagraphs, null);
}
/**
 * Convenience method to write all the text in a directory of PDF files.
 * <p>
 * Creates an extractor for {@code inputDir}, applies the optional password and
 * error tracker, configures the output format and paragraph estimation, processes
 * the files and finally closes the PDF file, even if processing fails.
 *
 * @param inputDir           directory containing PDF files
 * @param password           user or owner password for PDF files ({@code null} if none)
 * @param outputDir          directory for writing out the extracted text files
 * @param maxPages           limit to just the first maxPages of a document
 * @param format             set the output format for the text content (TXT or XML)
 * @param estimateParagraphs set if JPedal should estimate paragraph spacing in output.
 * @param errorTracker       a custom error tracker ({@code null} if none)
 * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
 */
public static void writeAllTextToDir(final String inputDir, final String password, final String outputDir,
        final int maxPages, final OUTPUT_FORMAT format, final boolean estimateParagraphs, final ErrorTracker errorTracker) throws PdfException {
    final ExtractTextInRectangle extract = new ExtractTextInRectangle(inputDir);

    if (password != null) {
        extract.setPassword(password);
    }

    if (errorTracker != null) {
        // register the caller's tracker; the extractor itself used to be passed here,
        // so the supplied tracker was never installed
        extract.decode_pdf.addExternalHandler(errorTracker, Options.ErrorTracker);
    }

    extract.setOutputFormat(format);
    extract.estimateParagraphs = estimateParagraphs;
    extract.setup(outputDir, maxPages);

    try {
        extract.processFiles(inputDir);
    } finally {
        // always release the PDF, even when processing throws
        extract.closePDFfile();
    }
}
/**
* Convenience method to write all the text in a directory of PDF files.
* <p>
* Uses plain text (TXT) output, no paragraph estimation and no custom error tracker.
*
* @param inputDir directory containing PDF files
* @param password user or owner password for PDF files
* @param outputDir directory for writing out the extracted text files
* @param maxPages limit to just the first maxPages of a document
* @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
*/
public static void writeAllTextToDir(final String inputDir, final String password, final String outputDir, final int maxPages) throws PdfException {
writeAllTextToDir(inputDir, password, outputDir, maxPages, OUTPUT_FORMAT.TXT, false, null);
}
/**
* Convenience method to write all the text in a directory of PDF files.
* <p>
* Delegates to the overload taking a password, passing null as the password.
*
* @param inputDir directory containing PDF files
* @param outputDir directory for writing out the extracted text files
* @param maxPages limit to just the first maxPages of a document
* @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
*/
public static void writeAllTextToDir(final String inputDir, final String outputDir, final int maxPages) throws PdfException {
writeAllTextToDir(inputDir, null, outputDir, maxPages);
}
/**
 * Records where output is written and how many pages to process.
 *
 * @param outputDir target directory; the separator is appended when it is missing
 * @param maxCount  page limit stored for later use when decoding files
 */
private void setup(String outputDir, final int maxCount) {
    // guarantee a trailing separator so names can be appended to output_dir directly
    output_dir = outputDir.endsWith(separator) ? outputDir : outputDir + separator;
    this.maxCount = maxCount;
}
}
|