PDFBoxメモ

この方法だと、画像を取得できない形式のPDFがあるようだ。XObjectをすべて取得し、PDXObjectImageのインスタンスか、PDXObjectFormのインスタンスかによって処理を分けるとよい。

	/**
	 * Extracts the images referenced by each page of a PDF file.
	 *
	 * <p>Every XObject in each page's resources is inspected: image XObjects are
	 * written out directly, and for form XObjects the images listed in the
	 * form's own resources are written out. Each image is passed to
	 * {@code write2file} with a running counter (0, 1, 2, ...) as the file name.
	 *
	 * <p>NOTE(review): form XObjects nested inside other form XObjects are not
	 * descended into; only one level of forms is handled.
	 *
	 * @param args command-line arguments; {@code args[0]} is the path of the PDF to read
	 * @throws IOException if the file cannot be read or parsed, or an image cannot be written
	 * @throws IllegalArgumentException if no input file path is given
	 */
	public static void main(String[] args) throws IOException {
		if (args == null || args.length < 1) {
			throw new IllegalArgumentException("usage: <pdf-file>");
		}
		String readFile = args[0];
		FileInputStream pdfStream = new FileInputStream(readFile);
		PDDocument pdf = null;
		try {
			PDFParser pdfParser = new PDFParser(pdfStream);
			pdfParser.parse();
			pdf = pdfParser.getPDDocument();

			int cnt = 0;
			// Elements are taken as Object and cast so the loop compiles whether
			// getAllPages() is declared as a raw List or a typed one.
			for (Object element : pdf.getDocumentCatalog().getAllPages()) {
				PDPage page = (PDPage) element;
				// A page without a resource dictionary has nothing to extract.
				if (page.getResources() == null) {
					continue;
				}
				Map<String, PDXObject> objs = page.getResources().getXObjects();
				if (objs == null) {
					continue;
				}
				for (PDXObject obj : objs.values()) {
					if (obj instanceof PDXObjectImage) {
						PDXObjectImage image = (PDXObjectImage) obj;
						image.write2file(String.valueOf(cnt));
						cnt++;
					} else if (obj instanceof PDXObjectForm) {
						PDXObjectForm form = (PDXObjectForm) obj;
						// A form may carry no resources of its own.
						if (form.getResources() == null) {
							continue;
						}
						Map<String, PDXObjectImage> images = form.getResources().getImages();
						if (images == null) {
							continue;
						}
						for (PDXObjectImage image : images.values()) {
							image.write2file(String.valueOf(cnt));
							cnt++;
						}
					}
				}
			}
		} finally {
			// Release the parsed document first, then the underlying file stream;
			// the nested finally guarantees the stream is closed even if
			// closing the document throws.
			try {
				if (pdf != null) {
					pdf.close();
				}
			} finally {
				pdfStream.close();
			}
		}
	}