001/* Copyright (C) 2014 konik.io
002 *
003 * This file is part of the Konik library.
004 *
005 * The Konik library is free software: you can redistribute it and/or modify
006 * it under the terms of the GNU Affero General Public License as
007 * published by the Free Software Foundation, either version 3 of the
008 * License, or (at your option) any later version.
009 *
010 * The Konik library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013 * GNU Affero General Public License for more details.
014 *
015 * You should have received a copy of the GNU Affero General Public License
016 * along with the Konik library. If not, see <http://www.gnu.org/licenses/>.
017 */
018package io.konik.itext.extractor;
019
020import static com.itextpdf.text.pdf.PdfName.EF;
021import static com.itextpdf.text.pdf.PdfName.F;
022import static com.itextpdf.text.pdf.PdfReader.getStreamBytes;
023import static javax.xml.bind.JAXBContext.newInstance;
024import io.konik.harness.InvoiceExtractionError;
025import io.konik.harness.InvoiceExtractor;
026import io.konik.zugferd.Invoice;
027
028import java.io.ByteArrayInputStream;
029import java.io.IOException;
030import java.io.InputStream;
031
032import javax.inject.Named;
033import javax.inject.Singleton;
034import javax.xml.bind.JAXBElement;
035import javax.xml.bind.JAXBException;
036import javax.xml.bind.Unmarshaller;
037import javax.xml.transform.Source;
038import javax.xml.transform.stream.StreamSource;
039
040import com.itextpdf.text.pdf.PRStream;
041import com.itextpdf.text.pdf.PdfArray;
042import com.itextpdf.text.pdf.PdfDictionary;
043import com.itextpdf.text.pdf.PdfName;
044import com.itextpdf.text.pdf.PdfReader;
045import com.itextpdf.text.pdf.PdfStream;
046
047/**
048 * The Class iText Pdf Invoice Extractor.
049 * 
050 */
051@Named
052@Singleton
053public class ITextPdfInvoiceExtractor implements InvoiceExtractor {
054
055   private final static PdfName AF = new PdfName("AF");
056
057   @Override
058   public Invoice extract(byte[] pdfIn) {
059      return extract(new ByteArrayInputStream(pdfIn));
060   }
061   
062   @Override
063   public Invoice extract(InputStream pdfStream) {
064         PdfReader reader = getPdfReader(pdfStream);
065         PdfArray af = getValidAf(reader.getCatalog());
066         PdfDictionary fileSpec = getValidFileSpec(af); 
067         PdfDictionary ef = getValidEf(fileSpec);
068         byte[] invoiceXmlContent = getFStream(ef);
069         return covertToObjectModel(invoiceXmlContent);
070   }
071   
072   
073
074   /**
075    * Extract invoice from PDF ot XMl byte Array
076    *
077    * @param pdfStream the pdf stream
078    * @return the byte[] of the xml ivoice contetn.
079    */
080   public byte[] extractPlain(InputStream pdfStream) {
081         PdfReader reader = getPdfReader(pdfStream);
082         PdfArray af = getValidAf(reader.getCatalog());
083         PdfDictionary fileSpec = getValidFileSpec(af); 
084         PdfDictionary ef = getValidEf(fileSpec);
085         return getFStream(ef);
086   }
087
088   private PdfReader getPdfReader(InputStream pdfStream) {
089      try {
090         return new PdfReader(pdfStream);
091      } catch (IOException e) {
092         throw new InvoiceExtractionError("Could not read or open pdf.",e);
093      }
094   }
095
096   private PdfArray getValidAf(PdfDictionary catalog) {
097      if (catalog.contains(AF)) {
098         PdfArray af = catalog.getAsArray(AF);
099         if (!af.isEmpty() && af.getDirectObject(0).isDictionary())
100            return af;
101      }
102      throw new InvoiceExtractionError("Pdf catalog does not contain Valid AF Entry");
103   }
104   
105   private PdfDictionary getValidFileSpec(PdfArray af) {
106      if (af.isEmpty() || af.getAsDict(0) == null) {
107         throw new InvoiceExtractionError("Pdf does not contain a FileSpec Entry");
108      }
109      return af.getAsDict(0);
110   }
111   
112   private PdfDictionary getValidEf(PdfDictionary fileSpec) {
113      if (fileSpec.contains(EF)) {
114         return fileSpec.getAsDict(EF);
115      }
116      throw new InvoiceExtractionError("Pdf catalog does not contain Valid EF Entry");
117   }
118
119   private byte[] getFStream(PdfDictionary ef){
120      if (ef.contains(F)) {
121         PdfStream xmlStream = ef.getAsStream(F);
122         try {
123            return getStreamBytes((PRStream) xmlStream);
124         } catch (IOException e) {
125            throw new InvoiceExtractionError("Could not extrac xml content form pdf.",e);
126         }
127      }
128      throw new InvoiceExtractionError("Pdf catalog does not contain Valid F Entry");
129   }
130   
131   static Invoice covertToObjectModel(byte[] xmlContent){
132      try {
133         Unmarshaller unmarshaller = newInstance("io.konik.zugferd").createUnmarshaller();
134         Source s = new StreamSource(new ByteArrayInputStream(xmlContent));
135         JAXBElement<Invoice> invoice = unmarshaller.unmarshal(s, Invoice.class);
136         return invoice.getValue();
137      } catch (JAXBException e) {
138         throw new InvoiceExtractionError("Could not read parse xml content",e);
139      }
140   }
141}