001/* 002 * Copyright (C) 2014 Konik.io 003 * 004 * This file is part of Konik library. 005 * 006 * Konik library is free software: you can redistribute it and/or modify 007 * it under the terms of the GNU Affero General Public License as published by 008 * the Free Software Foundation, either version 3 of the License, or 009 * (at your option) any later version. 010 * 011 * Konik library is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 014 * GNU Affero General Public License for more details. 015 * 016 * You should have received a copy of the GNU Affero General Public License 017 * along with Konik library. If not, see <http://www.gnu.org/licenses/>. 018 */ 019package io.konik.itext.extractor; 020 021import static com.itextpdf.text.pdf.PdfName.EF; 022import static com.itextpdf.text.pdf.PdfName.F; 023import static com.itextpdf.text.pdf.PdfReader.getStreamBytes; 024import static javax.xml.bind.JAXBContext.newInstance; 025import io.konik.harness.InvoiceExtractionError; 026import io.konik.harness.InvoiceExtractor; 027import io.konik.zugferd.Invoice; 028 029import java.io.ByteArrayInputStream; 030import java.io.IOException; 031import java.io.InputStream; 032 033import javax.inject.Named; 034import javax.inject.Singleton; 035import javax.xml.bind.JAXBElement; 036import javax.xml.bind.JAXBException; 037import javax.xml.bind.Unmarshaller; 038import javax.xml.transform.Source; 039import javax.xml.transform.stream.StreamSource; 040 041import com.itextpdf.text.pdf.PRStream; 042import com.itextpdf.text.pdf.PdfArray; 043import com.itextpdf.text.pdf.PdfDictionary; 044import com.itextpdf.text.pdf.PdfName; 045import com.itextpdf.text.pdf.PdfReader; 046import com.itextpdf.text.pdf.PdfStream; 047 048/** 049 * The Class iText Pdf Invoice Extractor. 050 * 051 */ 052@Named 053@Singleton 054public class ITextPdfInvoiceExtractor implements InvoiceExtractor { 055 056 private final static PdfName AF = new PdfName("AF"); 057 058 @Override 059 public Invoice extract(byte[] pdfIn) { 060 return extract(new ByteArrayInputStream(pdfIn)); 061 } 062 063 @Override 064 public Invoice extract(InputStream pdfStream) { 065 PdfReader reader = getPdfReader(pdfStream); 066 PdfArray af = getValidAf(reader.getCatalog()); 067 PdfDictionary fileSpec = getValidFileSpec(af); 068 PdfDictionary ef = getValidEf(fileSpec); 069 byte[] invoiceXmlContent = getFStream(ef); 070 return covertToObjectModel(invoiceXmlContent); 071 } 072 073 private PdfReader getPdfReader(InputStream pdfStream) { 074 try { 075 return new PdfReader(pdfStream); 076 } catch (IOException e) { 077 throw new InvoiceExtractionError("Could not read or open pdf.",e); 078 } 079 } 080 081 private PdfArray getValidAf(PdfDictionary catalog) { 082 if (catalog.contains(AF)) { 083 PdfArray af = catalog.getAsArray(AF); 084 if (!af.isEmpty() && af.getDirectObject(0).isDictionary()) 085 return af; 086 } 087 throw new InvoiceExtractionError("Pdf catalog does not contain Valid AF Entry"); 088 } 089 090 private PdfDictionary getValidFileSpec(PdfArray af) { 091 if (af.isEmpty() || af.getAsDict(0) == null) { 092 throw new InvoiceExtractionError("Pdf does not contain a FileSpec Entry"); 093 } 094 return af.getAsDict(0); 095 } 096 097 private PdfDictionary getValidEf(PdfDictionary fileSpec) { 098 if (fileSpec.contains(EF)) { 099 return fileSpec.getAsDict(EF); 100 } 101 throw new InvoiceExtractionError("Pdf catalog does not contain Valid EF Entry"); 102 } 103 104 private byte[] getFStream(PdfDictionary ef){ 105 if (ef.contains(F)) { 106 PdfStream xmlStream = ef.getAsStream(F); 107 try { 108 return getStreamBytes((PRStream) xmlStream); 109 } catch (IOException e) { 110 throw new InvoiceExtractionError("Could not extrac xml content form pdf.",e); 111 } 112 } 113 throw new InvoiceExtractionError("Pdf catalog does not contain Valid F Entry"); 114 } 115 116 static Invoice covertToObjectModel(byte[] xmlContent){ 117 try { 118 Unmarshaller unmarshaller = newInstance("io.konik.zugferd").createUnmarshaller(); 119 Source s = new StreamSource(new ByteArrayInputStream(xmlContent)); 120 JAXBElement<Invoice> invoice = unmarshaller.unmarshal(s, Invoice.class); 121 return invoice.getValue(); 122 } catch (JAXBException e) { 123 throw new InvoiceExtractionError("Could not read parse xml content",e); 124 } 125 } 126}