/*
* Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)
*/
package org.jpedal.examples.text;
import org.jpedal.exception.PdfException;
import org.jpedal.external.ErrorTracker;
import org.jpedal.external.Options;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Strip;
import java.awt.Rectangle;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
/**
 * <h2>Extract words and locations from PDF files</h2>
 * <br>
 * This class provides a simple Java API to extract text as words, together with
 * the location of each word on the page, from a PDF file. It also offers static
 * convenience methods if you just want to dump all the word lists from a PDF
 * file or a directory containing PDF files.<br>
 * <br><a href="https://support.idrsolutions.com/jpedal/tutorials/extract-text/">See our Support Pages for more information on Text Extraction.</a><br>
 */
public class ExtractTextAsWordlist extends BaseTextExtraction {

    /**
     * Default delimiters used to discern the boundary of words in page content
     */
    private static final String DEFAULT_DELIMITERS = "&:=()!;.,\\/\"\"''";

    /**
     * Each word is returned as this many consecutive list entries (word,x1,y1,x2,y2)
     */
    private static final int VALUES_PER_WORD = 5;

    /**
     * word count - used for testing
     */
    private int wordsExtracted;

    /**
     * Sets up an ExtractTextAsWordlist instance to open a PDF File
     *
     * @param fileName full path to a single PDF file
     */
    public ExtractTextAsWordlist(final String fileName) {
        super(fileName);
        init();
    }

    /**
     * Sets up an ExtractTextAsWordlist instance to open a PDF file contained as a BLOB within a byte[] stream
     *
     * @param byteArray pdf file data
     */
    public ExtractTextAsWordlist(final byte[] byteArray) {
        super(byteArray);
        init();
    }

    /**
     * Opens the given file and writes one text file per page
     * ({@code words-<page>.txt}) into a sub-directory of the output directory
     * named after the PDF. Each line of a file holds a word followed by its
     * integer co-ordinates: {@code word,x1,y1,x2,y2}.
     *
     * @param file_name path of the PDF file to decode
     * @throws PdfException wrapping any failure raised while extracting or writing the words
     */
    @Override
    void decodeFile(final String file_name) throws PdfException {

        fileName = file_name;

        if (!openPDFFile()) {
            return;
        }

        //sub-directory (named after the PDF) which receives the text files
        final String outputDir = output_dir + separator + deriveOutputName(file_name) + separator;

        //page range
        final int start = 1;
        int end = getPageCount();

        //limit number of pages in testing
        //NOTE(review): the cap is only applied to documents with more than 10 pages - confirm this is intended
        if (end > 10 && maxCount > 0 && end > maxCount) {
            end = maxCount;
        }

        try {
            for (int page = start; page <= end; page++) { //read pages

                selectPage(page);

                final List<String> words = getWordsOnPage(page, DEFAULT_DELIMITERS);
                if (words == null) {
                    continue;
                }

                //create a directory if it doesn't exist
                final File output_path = new File(outputDir);
                if (!output_path.exists()) {
                    output_path.mkdirs();
                }

                //update our count
                wordsExtracted += words.size() / VALUES_PER_WORD;

                writeWordsToFile(words, outputDir + "words-" + page + ".txt");
            }

            //remove data once written out
            decode_pdf.flushObjectValues(false);

        } catch (final Exception e) {
            LogWriter.writeLog(e);
            throw new PdfException(e.getMessage(), e);
        }
    }

    /**
     * Works out the name of the sub-directory used for a PDF: the last path
     * element with its extension removed.
     *
     * @param filePath path of the PDF file
     * @return the file name without path or extension, or "demo" when the path
     * contains no separator or ends with one
     */
    private String deriveOutputName(final String filePath) {

        final String fallback = "demo"; //set a default just in case

        final int pointer = filePath.lastIndexOf(separator);
        if (pointer == -1) {
            return fallback;
        }

        final String leaf = filePath.substring(pointer + 1);
        if (leaf.isEmpty()) {
            return fallback;
        }

        //strip the extension by position of the last dot rather than assuming it is 4 characters long,
        //which threw StringIndexOutOfBoundsException on short names
        final int dot = leaf.lastIndexOf('.');
        return dot > 0 ? leaf.substring(0, dot) : leaf;
    }

    /**
     * Writes a word list to a UTF-8 text file, one {@code word,x1,y1,x2,y2} line per word.
     *
     * @param words      list holding 5 consecutive values per word (word,x1,y1,x2,y2)
     * @param targetFile path of the file to create
     * @throws java.io.IOException if the file cannot be created or written
     */
    private void writeWordsToFile(final List<String> words, final String targetFile) throws java.io.IOException {

        try (OutputStreamWriter output_stream = new OutputStreamWriter(
                new FileOutputStream(targetFile), StandardCharsets.UTF_8)) {

            final Iterator<String> wordIterator = words.iterator();
            while (wordIterator.hasNext()) {

                /*remove the XML formatting if present - not needed for pure text*/
                final String currentWord = Strip.convertToText(wordIterator.next(), decode_pdf.isXMLExtraction());

                /*
                 * these co-ordinates are absolute from the bottom of the page (MediaBox).
                 * If you are working with an extracted image (which may use the crop box)
                 * you will need to adjust them
                 */
                final int wx1 = (int) Float.parseFloat(wordIterator.next());
                final int wy1 = (int) Float.parseFloat(wordIterator.next());
                final int wx2 = (int) Float.parseFloat(wordIterator.next());
                final int wy2 = (int) Float.parseFloat(wordIterator.next());

                /*this could be inserting into a database instead*/
                output_stream.write(currentWord + ',' + wx1 + ',' + wy1 + ',' + wx2 + ',' + wy2 + '\n');
            }
        }
    }

    /**
     * Gets the individual words from the pages text content and returns them.
     * Uses a default set of delimiters to determine word bounds.
     *
     * @param page The page to get text content from.
     * @return List object containing all words found on the page.
     * @throws PdfException if problem with parsing and extracting text from PDF file
     */
    public List<String> getWordsOnPage(final int page) throws PdfException {
        return getWordsOnPage(page, DEFAULT_DELIMITERS);
    }

    /**
     * Gets the individual words from the pages text content and returns them.
     * Uses the provided delimiters to determine word bounds. The whole MediaBox
     * of the page is used as the extraction area.
     *
     * @param page       The page to get text content from.
     * @param delimiters A String of characters to be used as delimiters for words.
     * @return List object containing all words found on the page.
     * @throws PdfException if problem with parsing and extracting text from PDF file
     */
    public List<String> getWordsOnPage(final int page, final String delimiters) throws PdfException {

        checkFileOpened();
        selectPage(page);

        //left and right edges of the MediaBox
        final int x1 = currentPageData.getMediaBoxX(page);
        final int x2 = currentPageData.getMediaBoxWidth(page) + x1;

        //bottom and top edges - taken from the Y origin (previously read the X origin by mistake);
        //top = bottom + height, mirroring right = left + width above
        final int y2 = currentPageData.getMediaBoxY(page);
        final int y1 = currentPageData.getMediaBoxHeight(page) + y2;

        return getWordsOnPage(page, x1, y1, x2, y2, delimiters);
    }

    /**
     * Gets the individual words from the pages text content with a greater degree of control.
     *
     * @param page       The page to get text content from.
     * @param x1         The left most point to extract from.
     * @param y1         The top most point to extract from.
     * @param x2         The right most point to extract from.
     * @param y2         The bottom most point to extract from.
     * @param delimiters A String of characters to be used as delimiters for words.
     * @return List object containing all words found on the page.
     * @throws PdfException if problem with parsing and extracting text from PDF file
     */
    public List<String> getWordsOnPage(final int page, final int x1, final int y1, final int x2, final int y2, final String delimiters) throws PdfException {

        checkFileOpened();
        selectPage(page);

        return currentGrouping.extractTextAsWordlist(x1, y1, x2, y2, page, true, delimiters);
    }

    /**
     * Gets the individual words from the pages text content with a greater degree of control.
     *
     * @param page       The page to get text content from.
     * @param rectangle  Rectangle area on the page to extract words from.
     * @param delimiters A String of characters to be used as delimiters for words.
     * @return List object containing all words found on the page.
     * @throws PdfException if problem with parsing and extracting text from PDF file
     */
    public List<String> getWordsOnPage(final int page, final Rectangle rectangle, final String delimiters) throws PdfException {

        checkFileOpened();
        selectPage(page);

        final int left = rectangle.x;
        final int right = rectangle.x + rectangle.width;
        final int firstY = rectangle.y;
        final int secondY = rectangle.y + rectangle.height;

        return currentGrouping.extractTextAsWordlist(left, firstY, right, secondY, page, true, delimiters);
    }

    /**
     * This class will allow you to extract any Words from page as a list via command line from a single PDF file or a directory of PDF files.
     * <br>
     * The example expects two parameters:
     * <ul>
     * <li>Value 1 is the file name or directory of PDF files to process</li>
     * <li>Value 2 is the directory to write out the word lists as text files</li>
     * </ul>
     *
     * @param args The expected arguments are described above.
     */
    @SuppressWarnings("unused")
    public static void main(final String[] args) {

        final int len = args.length;

        if (len == 0) {
            System.out.println("Example takes 2 parameters");
            System.out.println("Value 1 is the file name or directory of PDF files to process");
            System.out.println("Value 2 is Directory for writing the data as text files");
            System.exit(0);
        } else if (len == 2) {
            try {
                writeAllWordlistsToDir(args[0], args[1], -1);
            } catch (final PdfException e) {
                LogWriter.writeLog(e);
            }
        } else {
            //a single argument used to be reported as "too many"
            final String problem = len < 2 ? "too few" : "too many";
            System.out.println(problem + " arguments entered - run with no values to see defaults");

            final StringBuilder arguments = new StringBuilder();
            for (final String arg : args) {
                arguments.append(arg).append('\n');
            }
            System.out.println("you entered:\n" + arguments + "as the arguments");
            System.exit(0);
        }
    }

    /**
     * Selects word-list extraction as the extraction type before running the shared initialisation.
     */
    @Override
    void init() {
        type = BaseTextExtraction.ExtractTypes.TEXT_AS_WORDLIST;
        super.init();
    }

    /**
     * Convenience method to write all the Wordlists in a directory of PDF files
     *
     * @param inputDir  directory containing PDF files
     * @param password  user or owner password for pdf file
     * @param outputDir directory for writing out wordlists
     * @param maxPages  limit to the first pages up to this page
     * @return count of words extracted in total
     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
     */
    public static int writeAllWordlistsToDir(final String inputDir, final String password, final String outputDir, final int maxPages) throws PdfException {
        return writeAllWordlistsToDir(inputDir, password, outputDir, maxPages, null);
    }

    /**
     * Convenience method to write all the Wordlists in a directory of PDF files
     *
     * @param inputDir     directory containing PDF files
     * @param password     user or owner password for pdf file (may be null)
     * @param outputDir    directory for writing out wordlists
     * @param maxPages     limit to the first pages up to this page
     * @param errorTracker a custom error tracker (may be null)
     * @return count of words extracted in total
     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
     */
    public static int writeAllWordlistsToDir(final String inputDir, final String password, final String outputDir,
                                             final int maxPages, final ErrorTracker errorTracker) throws PdfException {

        final ExtractTextAsWordlist extract = new ExtractTextAsWordlist(inputDir);

        try {
            if (password != null) {
                extract.setPassword(password);
            }

            if (errorTracker != null) {
                //register the tracker supplied by the caller (previously the extractor itself was passed in)
                extract.decode_pdf.addExternalHandler(errorTracker, Options.ErrorTracker);
            }

            extract.setup(outputDir, maxPages);
            extract.processFiles(inputDir);
        } finally {
            //always release the PDF, even if processing failed
            extract.closePDFfile();
        }

        return extract.wordsExtracted;
    }

    /**
     * Convenience method to write all the Wordlists in a directory of PDF files
     *
     * @param inputDir  directory containing PDF files
     * @param outputDir directory for writing out wordlists
     * @param maxPages  limit to just the first maxPages of a document
     * @return count of number of words extracted
     * @throws org.jpedal.exception.PdfException if problem with parsing and extracting text from PDF file
     */
    public static int writeAllWordlistsToDir(final String inputDir, final String outputDir, final int maxPages) throws PdfException {
        return writeAllWordlistsToDir(inputDir, null, outputDir, maxPages, null);
    }

    /**
     * Stores the output directory (guaranteed to end with the separator) and the page limit.
     *
     * @param outputDir directory for writing out wordlists
     * @param maxCount  page limit later applied in decodeFile
     */
    private void setup(String outputDir, final int maxCount) {

        //check output dir has separator
        if (!outputDir.endsWith(separator)) {
            outputDir += separator;
        }

        output_dir = outputDir;
        this.maxCount = maxCount;
    }
}
|