001/* Copyright (C) 2014 konik.io 002 * 003 * This file is part of the Konik library. 004 * 005 * The Konik library is free software: you can redistribute it and/or modify 006 * it under the terms of the GNU Affero General Public License as 007 * published by the Free Software Foundation, either version 3 of the 008 * License, or (at your option) any later version. 009 * 010 * The Konik library is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 * GNU Affero General Public License for more details. 014 * 015 * You should have received a copy of the GNU Affero General Public License 016 * along with the Konik library. If not, see <http://www.gnu.org/licenses/>. 017 */ 018package io.konik.itext.extractor; 019 020import static com.itextpdf.text.pdf.PdfName.EF; 021import static com.itextpdf.text.pdf.PdfName.F; 022import static com.itextpdf.text.pdf.PdfReader.getStreamBytes; 023import static javax.xml.bind.JAXBContext.newInstance; 024import io.konik.harness.InvoiceExtractionError; 025import io.konik.harness.InvoiceExtractor; 026import io.konik.zugferd.Invoice; 027 028import java.io.ByteArrayInputStream; 029import java.io.IOException; 030import java.io.InputStream; 031 032import javax.inject.Named; 033import javax.inject.Singleton; 034import javax.xml.bind.JAXBElement; 035import javax.xml.bind.JAXBException; 036import javax.xml.bind.Unmarshaller; 037import javax.xml.transform.Source; 038import javax.xml.transform.stream.StreamSource; 039 040import com.itextpdf.text.pdf.PRStream; 041import com.itextpdf.text.pdf.PdfArray; 042import com.itextpdf.text.pdf.PdfDictionary; 043import com.itextpdf.text.pdf.PdfName; 044import com.itextpdf.text.pdf.PdfReader; 045import com.itextpdf.text.pdf.PdfStream; 046 047/** 048 * The Class iText Pdf Invoice Extractor. 049 * 050 */ 051@Named 052@Singleton 053public class ITextPdfInvoiceExtractor implements InvoiceExtractor { 054 055 private final static PdfName AF = new PdfName("AF"); 056 057 @Override 058 public Invoice extract(byte[] pdfIn) { 059 return extract(new ByteArrayInputStream(pdfIn)); 060 } 061 062 @Override 063 public Invoice extract(InputStream pdfStream) { 064 PdfReader reader = getPdfReader(pdfStream); 065 PdfArray af = getValidAf(reader.getCatalog()); 066 PdfDictionary fileSpec = getValidFileSpec(af); 067 PdfDictionary ef = getValidEf(fileSpec); 068 byte[] invoiceXmlContent = getFStream(ef); 069 return covertToObjectModel(invoiceXmlContent); 070 } 071 072 073 074 /** 075 * Extract invoice from PDF ot XMl byte Array 076 * 077 * @param pdfStream the pdf stream 078 * @return the byte[] of the xml ivoice contetn. 079 */ 080 public byte[] extractPlain(InputStream pdfStream) { 081 PdfReader reader = getPdfReader(pdfStream); 082 PdfArray af = getValidAf(reader.getCatalog()); 083 PdfDictionary fileSpec = getValidFileSpec(af); 084 PdfDictionary ef = getValidEf(fileSpec); 085 return getFStream(ef); 086 } 087 088 private PdfReader getPdfReader(InputStream pdfStream) { 089 try { 090 return new PdfReader(pdfStream); 091 } catch (IOException e) { 092 throw new InvoiceExtractionError("Could not read or open pdf.",e); 093 } 094 } 095 096 private PdfArray getValidAf(PdfDictionary catalog) { 097 if (catalog.contains(AF)) { 098 PdfArray af = catalog.getAsArray(AF); 099 if (!af.isEmpty() && af.getDirectObject(0).isDictionary()) 100 return af; 101 } 102 throw new InvoiceExtractionError("Pdf catalog does not contain Valid AF Entry"); 103 } 104 105 private PdfDictionary getValidFileSpec(PdfArray af) { 106 if (af.isEmpty() || af.getAsDict(0) == null) { 107 throw new InvoiceExtractionError("Pdf does not contain a FileSpec Entry"); 108 } 109 return af.getAsDict(0); 110 } 111 112 private PdfDictionary getValidEf(PdfDictionary fileSpec) { 113 if (fileSpec.contains(EF)) { 114 return fileSpec.getAsDict(EF); 115 } 116 throw new InvoiceExtractionError("Pdf catalog does not contain Valid EF Entry"); 117 } 118 119 private byte[] getFStream(PdfDictionary ef){ 120 if (ef.contains(F)) { 121 PdfStream xmlStream = ef.getAsStream(F); 122 try { 123 return getStreamBytes((PRStream) xmlStream); 124 } catch (IOException e) { 125 throw new InvoiceExtractionError("Could not extrac xml content form pdf.",e); 126 } 127 } 128 throw new InvoiceExtractionError("Pdf catalog does not contain Valid F Entry"); 129 } 130 131 static Invoice covertToObjectModel(byte[] xmlContent){ 132 try { 133 Unmarshaller unmarshaller = newInstance("io.konik.zugferd").createUnmarshaller(); 134 Source s = new StreamSource(new ByteArrayInputStream(xmlContent)); 135 JAXBElement<Invoice> invoice = unmarshaller.unmarshal(s, Invoice.class); 136 return invoice.getValue(); 137 } catch (JAXBException e) { 138 throw new InvoiceExtractionError("Could not read parse xml content",e); 139 } 140 } 141}