001/* Copyright (C) 2014 konik.io
002 *
003 * This file is part of the Konik library.
004 *
005 * The Konik library is free software: you can redistribute it and/or modify
006 * it under the terms of the GNU Affero General Public License as
007 * published by the Free Software Foundation, either version 3 of the
008 * License, or (at your option) any later version.
009 *
010 * The Konik library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013 * GNU Affero General Public License for more details.
014 *
015 * You should have received a copy of the GNU Affero General Public License
016 * along with the Konik library. If not, see <http://www.gnu.org/licenses/>.
017 */
018package io.konik.carriage.pdfbox;
019
020import io.konik.carriage.utils.CallBackInputStream;
021import io.konik.harness.FileExtractor;
022import io.konik.harness.exception.InvoiceExtractionError;
023
024import java.io.IOException;
025import java.io.InputStream;
026
027import org.apache.pdfbox.io.IOUtils;
028import org.apache.pdfbox.pdmodel.PDDocument;
029import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
030import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
031import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
032
033/**
034 * The PDFBoxInvoice Extractor.
035 */
036public class PDFBoxInvoiceExtractor implements FileExtractor {
037
038   static final String NO_FILE = "Provided PDF does not contain embedded files.";
039   static final String NO_ZF_FILE = "The PDF does not contain an attached file named ZUGFeRD-invoice.xml. Error in: ";
040   static final String ZF_FILE_NAME = "ZUGFeRD-invoice.xml";
041
042   @Override
043   public byte[] extract(InputStream pdfInput) {
044      InputStream attachmentFile = null;
045      try {
046         attachmentFile = extractToStream(pdfInput);
047         return IOUtils.toByteArray(attachmentFile);
048      } catch (IOException e) {
049         throw new InvoiceExtractionError("Error extracting content from PDF",e);
050      }finally {
051         IOUtils.closeQuietly(attachmentFile);
052      }
053   }
054   
055   @Override
056   public InputStream extractToStream(InputStream pdfInput) {
057      try {
058         return extractIntern(pdfInput);
059      } catch (IOException e) {
060         throw new InvoiceExtractionError("Error extracting content from PDF",e);
061      }
062   }
063   
064   private static final InputStream extractIntern(InputStream pdfStream) throws IOException {
065      PDDocument doc = PDDocument.load(pdfStream);
066      InputStream inputStream = extractZugferdFileAttachment(doc);
067      return new CallBackInputStream(inputStream, doc);
068   }
069   
070   private static final InputStream extractZugferdFileAttachment(PDDocument doc) throws IOException {
071      PDDocumentNameDictionary nameDictionary = new PDDocumentNameDictionary(doc.getDocumentCatalog());
072      PDEmbeddedFilesNameTreeNode embeddedFiles = listEmbeddedFiles(nameDictionary);
073      return extractZugferdXmlAttachment(embeddedFiles);
074   }
075
076   private static final PDEmbeddedFilesNameTreeNode listEmbeddedFiles(PDDocumentNameDictionary names) {
077      PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
078      if (embeddedFiles == null) { throw new InvoiceExtractionError(NO_FILE); }
079      return embeddedFiles;
080   }
081   
082   private static final InputStream extractZugferdXmlAttachment(PDEmbeddedFilesNameTreeNode embeddedFiles)
083         throws IOException {
084      PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) embeddedFiles.getValue(ZF_FILE_NAME);
085      if (fileSpec == null) { throw new InvoiceExtractionError(NO_ZF_FILE + ZF_FILE_NAME); }
086      return fileSpec.getEmbeddedFile().createInputStream();
087   }
088
089
090}