/*
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info: http://www.idrsolutions.com
* Help section for developers at http://www.idrsolutions.com/java-pdf-library-support/
*
* (C) Copyright 1997-2013, IDRsolutions and Contributors.
*
* This file is part of JPedal
*
This source code is copyright IDRSolutions 2012
*
* ---------------
* ExtractStructuredText.java
* ---------------
*/
/**
*
* Sample code showing how jpedal library can be used with
* pdf files to extract structured text from a PDF
*
* Debugging tip: Set verbose=true in LogWriter to see what is going on.
*
* It can run from jar directly using the command
*
* java -cp libraries_needed org/jpedal/examples/text/ExtractStructuredText inputValues
*
* where inputValues is two space delimited input values
*
* First value: The PDF filename (including the path if needed) or a directory containing PDF files. If it contains spaces it must be enclosed by double quotes (ie "C:/Path with spaces/").
* Second value (optional): Target directory for output data
*
*
* For non-structured files, consider
*
* http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextAsWordlist.java.html
* http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextInRectangle.java.html
* http://files.idrsolutions.com/samplecode/org/jpedal/examples/text/ExtractTextInRectangleAsTable.java.html
*/
package org.jpedal.examples.text;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import org.jpedal.PdfDecoderInt;
import org.jpedal.PdfDecoderServer;
import org.jpedal.exception.PdfException;
import org.jpedal.exception.PdfSecurityException;
import org.jpedal.utils.LogWriter;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamSource;
import javax.xml.transform.stream.StreamResult;
/**
 * Example showing how the JPedal library can be used to read the structured
 * (marked) content of a PDF file and write it out as an XML file.
 *
 * The constructor accepts either a single PDF file or a directory; for a
 * directory every entry whose name ends with ".pdf" (case-insensitive) is
 * processed. One XML file per PDF is written into the output directory.
 *
 * Note that {@link #output} is static, so the output directory is shared by
 * every instance of this class.
 */
public class ExtractStructuredText {

    /** test flag - when set, main() does not print the elapsed time */
    public static boolean isTest = false;

    /** directory where we put output files (defaults to an "xml" directory below the working directory) */
    protected static String output = System.getProperty("user.dir") + File.separator + "xml";

    /** flag to show if we display progress messages */
    public static boolean showMessages = true;

    /** correct separator for OS */
    protected String separator = System.getProperty("file.separator");

    /** the decoder object which decodes the pdf and returns a data object */
    protected PdfDecoderInt decodePdf = null;

    /** location of the output file written for the PDF currently being processed */
    protected String outputFile = "";

    /** classpath location of the stylesheet used to format the XML output */
    private static final String STYLESHEET = "/org/jpedal/examples/text/xmlstyle.xslt";

    /**
     * Example method to open a file or dir and extract the Structured Content to outputDir.
     *
     * @param root      a PDF file name, or a directory containing PDF files
     * @param outputDir directory the XML files are written to (created if it does not exist)
     */
    public ExtractStructuredText(String root, String outputDir) {

        output = outputDir;

        //check output dir has separator
        if (!output.endsWith(separator)) {
            output = output + separator;
        }

        //create a directory if it doesn't exist
        File outputPath = new File(output);
        if (!outputPath.exists() && !outputPath.mkdirs()) {
            System.err.println("Unable to create output directory " + output);
        }

        //if file name ends pdf, do the file otherwise do every pdf file in the directory
        if (root.toLowerCase().endsWith(".pdf")) {
            decodeFile(root);
            return;
        }

        //make sure name ends with a separator for correct path later
        if (!root.endsWith(separator)) {
            root = root + separator;
        }

        File inputDir = new File(root);
        if (!inputDir.isDirectory()) {
            System.err.println(root + " is not a directory. Exiting program");
            return;
        }

        //File.list() returns null on an I/O error, so that case is checked as well
        String[] files = null;
        try {
            files = inputDir.list();
        } catch (Exception ee) {
            LogWriter.writeLog("Exception trying to access file " + ee.getMessage());
        }
        if (files == null) {
            System.err.println("Unable to list files in " + root);
            return;
        }

        //now work through all pdf files
        int fileCount = files.length;
        for (int i = 0; i < fileCount; i++) {
            if (showMessages) {
                System.out.println(i + "/ " + fileCount + ' ' + files[i]);
            }
            if (files[i].toLowerCase().endsWith(".pdf")) {
                if (showMessages) {
                    System.out.println(root + files[i]);
                }
                decodeFile(root + files[i]);
            }
        }
    }

    /**
     * Routine to decode a single file: opens the PDF, writes its structured
     * content as XML to {@link #outputFile} and always closes the PDF again.
     *
     * @param file_name name (including path if needed) of the PDF file
     */
    protected void decodeFile(String file_name) {

        System.out.println("Processing " + file_name);

        outputFile = buildOutputFileName(file_name);

        /*debugging code to create a log
        LogWriter.setupLogFile(true,0,"","v",false);
        LogWriter.log_name = "/mnt/shared/log.txt";
        */

        //nothing can be extracted if the file could not be opened
        if (!openPdf(file_name)) {
            return;
        }

        try {
            //extract data from pdf (if allowed)
            if (decodePdf.isEncrypted() && !decodePdf.isPasswordSupplied()
                    && !decodePdf.isExtractionAllowed()) {
                if (showMessages) {
                    System.out.println("Encrypted settings");
                    System.out.println("Please look at Viewer for code sample to handle such files");
                    System.out.println("Or get support/consultancy");
                }
            } else {
                extractMarkedContent(file_name);

                //tell user
                if (showMessages) {
                    System.out.println("Text read");
                }
            }
        } finally {
            //the file is closed exactly once on every path, including early exits
            releaseDecoder();
        }
    }

    /**
     * Works out the XML file name for a PDF: the name of the PDF without its
     * path and without a trailing ".pdf", placed in the output directory.
     */
    private String buildOutputFileName(String file_name) {

        //allow for both separators
        int pointer = Math.max(file_name.lastIndexOf('/'), file_name.lastIndexOf('\\'));

        String name = file_name.substring(pointer + 1);
        if (name.toLowerCase().endsWith(".pdf")) {
            name = name.substring(0, name.length() - 4);
        }

        //output normally already ends with a separator - avoid doubling it
        String dir = output.endsWith(separator) ? output : output + separator;

        return dir + name + ".xml";
    }

    /**
     * Creates a new decoder and opens the file with it.
     *
     * @return true if the file was opened, false if an error was reported
     */
    private boolean openPdf(String file_name) {

        //make sure a decoder left over from a previous file is never reused by mistake
        decodePdf = null;

        //PdfDecoder returns a PdfException if there is a problem
        try {
            decodePdf = new PdfDecoderServer(false);

            if (showMessages) {
                System.out.println("\n----------------------------");
                System.out.println("Opening file :" + file_name);
            }

            //open the file (and read metadata including pages in file)
            decodePdf.openPdfFile(file_name);
            return true;

        } catch (PdfSecurityException se) {
            reportOpenFailure("Security Exception ", se, file_name);
        } catch (PdfException se) {
            reportOpenFailure("Pdf Exception ", se, file_name);
        } catch (Exception e) {
            reportOpenFailure("Exception ", e, file_name);
        }

        releaseDecoder();
        return false;
    }

    /** Prints an error for a file which could not be opened. */
    private static void reportOpenFailure(String label, Exception e, String file_name) {
        System.err.println(label + e
                + " in pdf code for text extraction on file "
                + file_name);
    }

    /**
     * Reads the marked content tree from the open PDF and, if there is one,
     * writes it to {@link #outputFile}. Problems are reported, not thrown.
     */
    private void extractMarkedContent(String file_name) {

        try {
            //read pages -if you already have code this is probably
            //all you need!
            Document tree = decodePdf.getMarkedContent();

            if (tree == null) {
                System.out.println("No text found");
            } else {
                if (!tree.hasChildNodes()) {
                    System.out.println("No tree data " + tree);
                    return;
                }

                writeTree(tree);

                if (showMessages) {
                    System.out.println("Writing to " + outputFile);
                }
            }

            if (showMessages) {
                System.out.println("\n----------done--------------");
            }

            //remove data once written out
            decodePdf.flushObjectValues(false);

        } catch (Exception e) {
            System.err.println("Exception " + e.getMessage());
            e.printStackTrace();
            System.out.println(file_name);
        }
    }

    /**
     * Formats the tree with the bundled stylesheet and writes it to
     * {@link #outputFile}. If the stylesheet resource is missing the tree is
     * written unformatted. As in the original example, a failure while
     * transforming prints the stack trace and ends the program.
     */
    private void writeTree(Document tree) {

        InputStream stylesheet = this.getClass().getResourceAsStream(STYLESHEET);

        try {
            TransformerFactory transformerFactory = TransformerFactory.newInstance();

            Transformer transformer;
            if (stylesheet == null) {
                System.err.println("Stylesheet " + STYLESHEET + " not found - writing unformatted XML");
                //newTransformer() without a source gives a plain copy (identity) transformer
                transformer = transformerFactory.newTransformer();
            } else {
                transformer = transformerFactory.newTransformer(new StreamSource(stylesheet));
            }

            //useful for debugging
            //transformer.transform(new DOMSource(tree), new StreamResult(System.out));

            //warn user if no content present
            if (tree.getDocumentElement() == null || !tree.getDocumentElement().hasChildNodes()) {
                tree.appendChild(tree.createComment("There is NO Structured text in the file to extract!!"));
                tree.appendChild(tree.createComment("JPedal can only extract it if it has been added when PDF created"));
                tree.appendChild(tree.createComment("Please read our blog post at http://www.jpedal.org/PDFblog/2010/09/the-easy-way-to-discover-if-a-pdf-file-contains-structured-content/ "));
            }

            transformer.transform(new DOMSource(tree), new StreamResult(outputFile));

        } catch (Throwable t) {
            //deliberately covers Errors as well as Exceptions, as the original example did
            t.printStackTrace();
            System.exit(1);
        } finally {
            if (stylesheet != null) {
                try {
                    stylesheet.close();
                } catch (java.io.IOException ignored) {
                    //nothing useful can be done if closing the stylesheet fails
                }
            }
        }
    }

    /**
     * Flushes the data structures of the decoder (not strictly required but
     * included as example) and closes the pdf file. Does nothing if no
     * decoder was created.
     */
    private void releaseDecoder() {

        if (decodePdf == null) {
            return;
        }

        try {
            decodePdf.flushObjectValues(true); //flush any text data read
        } finally {
            decodePdf.closePdfFile();
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /**
     * Main routine which checks for any files passed and runs the demo.
     *
     * @param args the PDF file or directory to read, optionally followed by
     *             the output directory (the default output directory is used
     *             when it is left out)
     */
    public static void main(String[] args) {

        if (showMessages) {
            System.out.println("Simple demo to extract text objects");
        }

        //check user has passed us a filename (the output dir is optional)
        if (args.length == 1 || args.length == 2) {

            String file_name = args[0];
            if (args.length == 2) {
                output = args[1];
            }

            System.out.println("File :" + file_name);

            //check file exists - there is nothing to do if it does not
            File pdf_file = new File(file_name);
            if (!pdf_file.exists()) {
                System.out.println("File " + file_name + " not found");
                return;
            }

            long now = System.currentTimeMillis();

            new ExtractStructuredText(file_name, output);

            long finished = System.currentTimeMillis();

            if (!isTest) {
                System.out.println("Time taken=" + ((finished - now) / 1000));
            }
        } else {
            System.out.println("Please call with parameters :-");
            System.out.println("FileName");
            System.out.println("outputDir (optional)");
        }
    }
}
|