2008年12月2日

Java PDF parser PDFBox

PDFBox

使用 command line 讀取文件
Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]
-password <password> Password to decrypt document
-encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)
-console Send text to console instead of file
-html Output in HTML format instead of raw text
-sort Sort the text before writing
-startPage <number> The first page to start extraction(1 based)
-endPage <number> The last page to extract(inclusive)
<PDF file> The PDF document to use
[Text File] The file to write the text to

撰寫程式讀取 test.pdf 文件
//-- Main.java --
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

/**
 * Minimal PDFBox example: extracts the text of a PDF file and prints it
 * to standard output.
 */
public class Main {
    /**
     * Loads the file at the relative path "test.pdf", extracts its text with a
     * default-configured {@code PDFTextStripper}, and prints the result.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the document cannot be loaded, parsed, or closed
     */
    public static void main(String[] args) throws Exception {
        PDDocument doc = PDDocument.load("test.pdf");
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            System.out.println(stripper.getText(doc));
        } finally {
            // A loaded PDDocument must be closed explicitly once it is no
            // longer needed; do so even when text extraction fails.
            doc.close();
        }
    }
}

使用 command line 產生文件影像檔,但效果不好,中文字也產生不出來
Usage: java org.pdfbox.PDFToImage [OPTIONS] <PDF file>
-password <password> Password to decrypt document
-imageType <image type> (BMP,bmp,jpg,JPG,wbmp,jpeg,png,PNG,JPEG,WBMP,GIF,gif)
-outputPrefix <output prefix> Filename prefix for image files
-startPage <number> The first page to start extraction(1 based)
-endPage <number> The last page to extract(inclusive)
<PDF file> The PDF document to use

撰寫程式產生文件影像檔,參考 PDFToImage 的 source code,結果與 command line 相同(效果不好,中文字也產生不出來)
//-- Main.java --
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.Iterator;
import java.util.List;

import javax.imageio.IIOException;
import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import javax.imageio.stream.ImageOutputStream;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;

/**
 * Minimal PDFBox example: renders every page of a PDF file to a JPEG image
 * stored in a temporary file. Modelled on PDFBox's PDFToImage tool.
 */
public class Main {
    /**
     * Loads the file at the relative path "test.pdf", converts each page to a
     * {@code BufferedImage}, and writes it as a JPEG into a freshly created
     * temporary file ("test_*.jpg"). The path of each temporary file is
     * printed to standard output before the image is written.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the document cannot be loaded or a page cannot be
     *                   rendered or written
     */
    public static void main(String[] args) throws Exception {
        PDDocument doc = PDDocument.load("test.pdf");
        try {
            List pages = doc.getDocumentCatalog().getAllPages();
            Iterator pageIter = pages.iterator();
            while (pageIter.hasNext()) {
                PDPage page = (PDPage) pageIter.next();
                BufferedImage image = page.convertToImage();
                File file = File.createTempFile("test_", ".jpg");
                System.out.println(file);
                ImageOutputStream output = ImageIO.createImageOutputStream(file);
                try {
                    if (!writeJpeg(image, output)) {
                        // Previously this case was silent and left an empty
                        // file behind with no hint as to why.
                        System.err.println("No JPEG writer could encode " + file);
                    }
                } finally {
                    output.close();
                }
            }
        } finally {
            // A loaded PDDocument must be closed explicitly once it is no
            // longer needed; do so even when rendering fails.
            doc.close();
        }
    }

    /**
     * Tries each registered "jpg" image writer in turn until one encodes the
     * image successfully.
     *
     * @param image  the rendered page to encode
     * @param output the stream the encoded image is written to
     * @return {@code true} if some writer encoded the image, {@code false} if
     *         no writer is registered or every writer failed
     * @throws java.io.IOException if a writer fails with an I/O error other
     *                             than an {@code IIOException}
     */
    private static boolean writeJpeg(BufferedImage image, ImageOutputStream output)
            throws java.io.IOException {
        Iterator writerIter = ImageIO.getImageWritersByFormatName("jpg");
        while (writerIter.hasNext()) {
            ImageWriter imageWriter = (ImageWriter) writerIter.next();
            try {
                ImageWriteParam writerParams = imageWriter.getDefaultWriteParam();
                if (writerParams.canWriteCompressed()) {
                    // Request the highest quality the writer supports.
                    writerParams.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
                    writerParams.setCompressionQuality(1.0f);
                }
                imageWriter.setOutput(output);
                imageWriter.write(null, new IIOImage(image, null, null), writerParams);
                return true;
            } catch (IIOException io) {
                // This writer could not encode the image; report it and fall
                // through to the next candidate writer.
                io.printStackTrace();
            } finally {
                imageWriter.dispose();
            }
        }
        return false;
    }
}

沒有留言:

網誌存檔