/* Copyright (C) 2014 konik.io
 *
 * This file is part of the Konik library.
 *
 * The Konik library is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * The Konik library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with the Konik library. If not, see <http://www.gnu.org/licenses/>.
 */
package io.konik.carriage.pdfbox;

import io.konik.carriage.utils.CallBackInputStream;
import io.konik.harness.FileExtractor;
import io.konik.harness.exception.InvoiceExtractionError;

import java.io.IOException;
import java.io.InputStream;

import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;

/**
 * The PDFBoxInvoice Extractor.
 * <p>
 * Reads a PDF with PDFBox and returns the content of the embedded file named
 * {@value #ZF_FILE_NAME}, either as a byte array or as a stream.
 */
public class PDFBoxInvoiceExtractor implements FileExtractor {

   static final String NO_FILE = "Provided PDF does not contain embedded files.";
   static final String NO_ZF_FILE = "The PDF does not contain an attached file named ZUGFeRD-invoice.xml. Error in: ";
   static final String ZF_FILE_NAME = "ZUGFeRD-invoice.xml";

   private static final String NO_ZF_CONTENT = "The attached file specification has no embedded file content. Error in: ";

   /**
    * Extracts the embedded invoice XML from the given PDF and returns it fully read into memory.
    *
    * @param pdfInput the PDF to read from; this method does not close it
    * @return the bytes of the embedded {@value #ZF_FILE_NAME} attachment
    * @throws InvoiceExtractionError if the PDF cannot be read or does not contain the attachment
    */
   @Override
   public byte[] extract(InputStream pdfInput) {
      InputStream attachmentFile = null;
      try {
         attachmentFile = extractToStream(pdfInput);
         return IOUtils.toByteArray(attachmentFile);
      } catch (IOException e) {
         throw new InvoiceExtractionError("Error extracting content from PDF", e);
      } finally {
         // Closing the attachment stream is what releases the resources acquired during extraction.
         IOUtils.closeQuietly(attachmentFile);
      }
   }

   /**
    * Extracts the embedded invoice XML from the given PDF and returns it as a stream.
    * <p>
    * The caller must close the returned stream.
    *
    * @param pdfInput the PDF to read from; this method does not close it
    * @return a stream over the content of the embedded {@value #ZF_FILE_NAME} attachment
    * @throws InvoiceExtractionError if the PDF cannot be read or does not contain the attachment
    */
   @Override
   public InputStream extractToStream(InputStream pdfInput) {
      try {
         return extractIntern(pdfInput);
      } catch (IOException e) {
         throw new InvoiceExtractionError("Error extracting content from PDF", e);
      }
   }

   /**
    * Loads the PDF and wraps the attachment stream together with the loaded document.
    * <p>
    * If locating or opening the attachment fails, the loaded document is closed here before the
    * failure propagates, so no document is left open on the error path.
    */
   private static InputStream extractIntern(InputStream pdfStream) throws IOException {
      PDDocument doc = PDDocument.load(pdfStream);
      boolean handedOver = false;
      try {
         InputStream attachment = extractZugferdFileAttachment(doc);
         // NOTE(review): CallBackInputStream presumably closes the document when the returned
         // stream is closed - confirm against its implementation.
         InputStream result = new CallBackInputStream(attachment, doc);
         handedOver = true;
         return result;
      } finally {
         if (!handedOver) {
            closeQuietly(doc);
         }
      }
   }

   /**
    * Closes the document, ignoring any failure so the exception that triggered the cleanup is
    * the one the caller sees.
    */
   private static void closeQuietly(PDDocument doc) {
      try {
         doc.close();
      } catch (IOException ignored) {
         // Intentionally ignored: the primary failure is already propagating.
      }
   }

   /** Resolves the embedded-files name tree of the document and opens the invoice attachment. */
   private static InputStream extractZugferdFileAttachment(PDDocument doc) throws IOException {
      PDDocumentNameDictionary nameDictionary = new PDDocumentNameDictionary(doc.getDocumentCatalog());
      PDEmbeddedFilesNameTreeNode embeddedFiles = listEmbeddedFiles(nameDictionary);
      return extractZugferdXmlAttachment(embeddedFiles);
   }

   /**
    * Returns the embedded-files name tree.
    *
    * @throws InvoiceExtractionError if the name dictionary has no embedded files entry
    */
   private static PDEmbeddedFilesNameTreeNode listEmbeddedFiles(PDDocumentNameDictionary names) {
      PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
      if (embeddedFiles == null) {
         throw new InvoiceExtractionError(NO_FILE);
      }
      return embeddedFiles;
   }

   /**
    * Looks up the {@value #ZF_FILE_NAME} entry in the name tree and opens a stream on its content.
    *
    * @throws InvoiceExtractionError if there is no such entry or the entry carries no embedded file
    */
   private static InputStream extractZugferdXmlAttachment(PDEmbeddedFilesNameTreeNode embeddedFiles)
         throws IOException {
      PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) embeddedFiles.getValue(ZF_FILE_NAME);
      if (fileSpec == null) {
         throw new InvoiceExtractionError(NO_ZF_FILE + ZF_FILE_NAME);
      }
      // A file specification may exist without an embedded stream; report that as a domain error
      // instead of letting it surface as a NullPointerException.
      if (fileSpec.getEmbeddedFile() == null) {
         throw new InvoiceExtractionError(NO_ZF_CONTENT + ZF_FILE_NAME);
      }
      return fileSpec.getEmbeddedFile().createInputStream();
   }

}