/*
* Copyright (c) 1997-2024 IDRsolutions (https://www.idrsolutions.com)
*/
package org.jpedal.examples.text;
import org.jpedal.exception.PdfException;
import org.jpedal.grouping.SearchType;
import org.jpedal.utils.LogWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * <h2>Find text in PDF files</h2>
 * <br>This class provides a simple Java API to find text in a PDF file,
 * either across a whole page or restricted to a rectangle on the page,
 * and also a static convenience method if you want to search
 * a PDF file or directory containing PDF files<br>
 * <br><a href="https://support.idrsolutions.com/jpedal/tutorials/search/find-text-in-a-pdf-file">See our Support Pages for more information on Text Searching.</a><br>
 */
public class FindTextInRectangle extends BaseTextExtraction {

    /**
     * Results collected by {@link #decodeFile(String)}: one entry is appended
     * for each page searched (also used as the return value for testing).
     */
    private final List<float[]> co_ords = new ArrayList<>();

    /**
     * Text to find when the whole document is searched via {@link #decodeFile(String)}.
     */
    private String textToFind;

    /**
     * Sets up a FindTextInRectangle instance to open a PDF file.
     *
     * @param fileName full path to a single PDF file
     */
    public FindTextInRectangle(final String fileName) {
        super(fileName);
        init();
    }

    /**
     * Sets up a FindTextInRectangle instance to open a PDF file contained as a BLOB within a byte[] stream.
     *
     * @param byteArray array that holds the BLOB
     */
    public FindTextInRectangle(final byte[] byteArray) {
        super(byteArray);
        init();
    }

    /**
     * Opens the given file and searches every page (1 to page count) for
     * {@link #textToFind}, appending each page's result to {@link #co_ords}.
     *
     * @param file_name path of the PDF file to decode
     * @throws PdfException declared by the overridden method
     */
    @Override
    void decodeFile(final String file_name) throws PdfException {

        fileName = file_name;

        if (!openPDFFile()) {
            return;
        }

        // page range
        final int start = 1;
        final int end = getPageCount();

        try {
            for (int currentPage = start; currentPage <= end; currentPage++) { // read pages

                selectPage(currentPage);

                // only search when a grouping object is available for the selected page
                if (currentGrouping != null) {
                    /*
                     * Co-ordinates are x1,y1 (top left hand corner), x2,y2 (bottom right).
                     * The array is stored exactly as returned by the search.
                     * NOTE(review): an older comment here said a miss yields null while the
                     * Javadoc of findTextOnPage says an empty array - confirm against findText.
                     */
                    final float[] pageCoords = findTextOnPage(currentPage, textToFind, SearchType.MUTLI_LINE_RESULTS);
                    co_ords.add(pageCoords);
                }
            }

            // remove data once written out
            decode_pdf.flushObjectValues(false);

        } catch (final Exception e) {
            LogWriter.writeLog(e);
            // NOTE(review): terminating the JVM from library code is drastic; kept as-is
            // because existing callers may rely on it - consider rethrowing instead.
            System.exit(1);
        }
    }

    /**
     * Return the coords for the page specified. The origin of the coords is the bottom left hand corner (on unrotated page)
     *
     * @param page       page number to check for results
     * @param textToFind text to look for
     * @param searchType a static int from org.jpedal.grouping.SearchType class
     * @return float[] containing all coords for the page, or empty array if no results found
     * <br>[0]=result x1 coord
     * <br>[1]=result y1 coord
     * <br>[2]=result x2 coord
     * <br>[3]=result y2 coord
     * <br>[4]=either -101 to show that the next text area is the remainder of this word on another line else any other value is ignored.
     * @throws org.jpedal.exception.PdfException PdfException
     */
    public float[] findTextOnPage(final int page, final String textToFind, final int searchType) throws PdfException {
        checkFileOpened();
        selectPage(page);
        return currentGrouping.findText(new String[]{textToFind}, searchType);
    }

    /**
     * Return the coords of matches inside the given rectangle on the page specified.
     * The origin of the coords is the bottom left hand corner (on unrotated page)
     *
     * @param page       page to search
     * @param x1         x1 of the rectangle to search
     * @param y1         y1 of the rectangle to search
     * @param x2         x2 of the rectangle to search
     * @param y2         y2 of the rectangle to search
     * @param textToFind text to look for
     * @param searchType a static int from org.jpedal.grouping.SearchType class
     * @return float[] containing all coords for the page, or empty array if no results found
     * <br>[0]=result x1 coord
     * <br>[1]=result y1 coord
     * <br>[2]=result x2 coord
     * <br>[3]=result y2 coord
     * <br>[4]=either -101 to show that the next text area is the remainder of this word on another line else any other value is ignored.
     * @throws PdfException pdfException
     */
    public float[] findTextOnPage(final int page, final int x1, final int y1, final int x2, final int y2, final String textToFind, final int searchType) throws PdfException {
        checkFileOpened();
        selectPage(page);
        return currentGrouping.findText(x1, y1, x2, y2, new String[]{textToFind}, searchType);
    }

    /**
     * Marks this instance as a FIND_TEXT_IN_RECTANGLE extraction before running the base initialisation.
     */
    @Override
    void init() {
        type = BaseTextExtraction.ExtractTypes.FIND_TEXT_IN_RECTANGLE;
        super.init();
    }

    /**
     * Convenience method to find text in a PDF file
     *
     * @param inputDir   a PDF file
     * @param textToFind text to look for
     * @return unmodifiable List containing a float[] of values for each page searched, in page order
     * (so normally the entry for page n is at index n-1).
     * The origin of the coords is the bottom left hand corner (on unrotated page) organised in the following order.
     * <br>[0]=result x1 coord
     * <br>[1]=result y1 coord
     * <br>[2]=result x2 coord
     * <br>[3]=result y2 coord
     * <br>[4]=either -101 to show that the next text area is the remainder of this word on another line else any other value is ignored.
     * @throws org.jpedal.exception.PdfException PdfException
     */
    public static List<float[]> findTextOnAllPages(final String inputDir, final String textToFind) throws PdfException {

        final FindTextInRectangle extract = new FindTextInRectangle(inputDir);
        extract.setup(textToFind);

        try {
            extract.processFiles(inputDir);
        } finally {
            // always release the PDF, even when processing fails part-way through
            extract.closePDFfile();
        }

        return Collections.unmodifiableList(extract.co_ords);
    }

    /**
     * Stores the text that {@link #decodeFile(String)} will search for.
     *
     * @param textToFind text to look for
     */
    private void setup(final String textToFind) {
        this.textToFind = textToFind;
    }
}
|