/*
 * Decompiled with CFR 0.152.
 */
package de.citec.scie.pdf;

import de.citec.scie.pdf.DocumentBlockCleaner;
import de.citec.scie.pdf.ParagraphEstimator;
import de.citec.scie.pdf.PreTextBlock;
import de.citec.scie.pdf.PreTextLine;
import de.citec.scie.pdf.TextBlockRankEstimator;
import de.citec.scie.pdf.VerticalAlignmentEstimator;
import de.citec.scie.pdf.WhiteSpaceEstimator;
import de.citec.scie.pdf.structure.Document;
import de.citec.scie.pdf.structure.Page;
import de.citec.scie.pdf.structure.Paragraph;
import de.citec.scie.pdf.structure.Text;
import de.citec.scie.pdf.structure.TextBlock;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.ResourceLoader;
import org.apache.pdfbox.util.TextPosition;

/**
 * Converts a PDF read from an {@link InputStream} into the project's
 * {@link Document} / {@link Page} / {@link TextBlock} / {@link Paragraph} /
 * {@link Text} structure, using PDFBox to obtain the glyph positions.
 *
 * <p>All entry points are static; each of them consumes and closes the
 * stream it is given.
 */
public class PDFStructuredTextExtractor {
    /**
     * Average paragraph length, in characters, below which all paragraphs of
     * a text block are merged into a single paragraph (see
     * {@code paragraphSanityCheck}).
     */
    public static final int MINIMUMPARSIZE = 80;

    /**
     * Parses the given PDF stream and builds the structured document.
     *
     * <p>One {@link Page} is created per PDF page (numbered from 1), the
     * page's text blocks are filled, and finally a
     * {@link DocumentBlockCleaner} is run over the whole document.
     *
     * @param input the PDF data; always closed before this method returns,
     *              whether it succeeds or throws
     * @return the extracted document
     * @throws IOException if parsing fails, if PDFBox reports no pages, or if
     *                     the document is empty after cleanup
     */
    public static Document importAsDocument(InputStream input) throws IOException {
        PDDocument doc = null;
        try {
            PDFParser parser = new PDFParser(input);
            parser.parse();
            doc = parser.getPDDocument();
            Document outDoc = new Document();
            List<?> allPages = doc.getDocumentCatalog().getAllPages();
            if (allPages.isEmpty()) {
                throw new IOException("PDFBox did not find any pages!");
            }
            int pageNum = 0;
            for (Object pageObj : allPages) {
                Page outPage = new Page();
                outPage.setPageNumber(++pageNum);
                outDoc.content.add(outPage);
                PDFStructuredTextExtractor.extractPage((PDPage)pageObj, outPage);
            }
            DocumentBlockCleaner cleaner = new DocumentBlockCleaner();
            cleaner.blockCleanup(outDoc);
            if (outDoc.content.isEmpty()) {
                throw new IOException("After cleanup the document contained nothing!");
            }
            return outDoc;
        }
        finally {
            // Nested so that a failure while closing the PDDocument can no
            // longer prevent the caller's stream from being closed.
            try {
                if (doc != null) {
                    doc.close();
                }
            }
            finally {
                input.close();
            }
        }
    }

    /**
     * Extracts the document and renders it via
     * {@code Document.indexedToString(0)}.
     *
     * @param input the PDF data; closed by this call
     * @return the string form of the extracted document
     * @throws IOException if the extraction fails
     */
    public static String importAsString(InputStream input) throws IOException {
        return PDFStructuredTextExtractor.importAsDocument(input).indexedToString(0);
    }

    /**
     * Extracts the document and exposes its string form as a UTF-8 encoded
     * in-memory stream.
     *
     * @param input the PDF data; closed by this call
     * @return a stream over the UTF-8 bytes of {@link #importAsString(InputStream)}
     * @throws IOException if the extraction fails
     */
    public static InputStream importAsInputStream(InputStream input) throws IOException {
        return new ByteArrayInputStream(PDFStructuredTextExtractor.importAsString(input).getBytes("UTF-8"));
    }

    /**
     * Runs the glyph preprocessor over one PDF page, converts every split
     * block into a {@link TextBlock} of {@code outPage}, then applies the
     * paragraph sanity check and the relative font size to each block.
     */
    private static void extractPage(PDPage page, Page outPage) throws IOException {
        PDPagePreprocessor preProc = new PDPagePreprocessor(page);
        preProc.process();
        TextBlockRankEstimator blockRankEst = new TextBlockRankEstimator();
        for (PreTextBlock splitBlock : preProc.getPreTextBlock().split()) {
            TextBlock outTextBlock = new TextBlock();
            outPage.content.add(outTextBlock);
            blockRankEst.addBlock(outTextBlock, splitBlock);
            PDFStructuredTextExtractor.fillTextBlock(outTextBlock, splitBlock);
        }
        // Relative font sizes are only requested after every block of the
        // page has been registered with the estimator.
        for (TextBlock outBlock : outPage.content) {
            PDFStructuredTextExtractor.paragraphSanityCheck(outBlock);
            outBlock.setRelativeFontSize(blockRankEst.getRelativeFontSize(outBlock));
        }
    }

    /**
     * Fills {@code outTextBlock} with paragraphs and text runs built from the
     * lines of {@code splitBlock}. A new {@link Paragraph} is started when
     * the {@link ParagraphEstimator} says so; a new {@link Text} run is
     * started whenever font name, font size or vertical alignment of a glyph
     * differs from the current run.
     */
    private static void fillTextBlock(TextBlock outTextBlock, PreTextBlock splitBlock) throws IOException {
        Paragraph outPar = new Paragraph();
        outTextBlock.content.add(outPar);
        Text outText = new Text();
        outPar.content.add(outText);
        ParagraphEstimator parEst = new ParagraphEstimator(splitBlock);
        WhiteSpaceEstimator spaceEst = new WhiteSpaceEstimator();
        // NOTE(review): assumes every split block has a non-empty first
        // line; otherwise the get(0) calls below throw. Confirm against
        // PreTextBlock.split().
        VerticalAlignmentEstimator vAlignEst = new VerticalAlignmentEstimator(splitBlock.lines.get(0));
        TextPosition firstGlyph = splitBlock.lines.get(0).content.get(0);
        PDFStructuredTextExtractor.applyGlyphFont(outText, firstGlyph);
        outText.setVerticalAlignment(vAlignEst.calculateAlignment(firstGlyph));
        StringBuilder currentTextBuilder = new StringBuilder();
        for (PreTextLine line : splitBlock.lines) {
            vAlignEst = new VerticalAlignmentEstimator(line);
            if (parEst.isNewParagraph(line)) {
                outPar = new Paragraph();
                outTextBlock.content.add(outPar);
                // Drop only the separator space appended after the previous
                // line. The buffer may be empty or end in a hyphen, in which
                // case nothing must be removed.
                PDFStructuredTextExtractor.stripTrailingSpace(currentTextBuilder);
                outText.setText(currentTextBuilder.toString());
                outText = new Text();
                outPar.content.add(outText);
                currentTextBuilder = new StringBuilder();
                // NOTE(review): unlike at block start, the vertical alignment
                // of the new run is not initialised here; depending on
                // Text's default this may cause an immediate run switch on
                // the first glyph. Left unchanged - confirm intent.
                PDFStructuredTextExtractor.applyGlyphFont(outText, line.content.get(0));
            }
            for (TextPosition glyph : line.content) {
                String glyphFont = PDFStructuredTextExtractor.fontNameOf(glyph);
                String runFont = outText.getFontName();
                boolean fontEquals = glyphFont == null ? runFont == null : glyphFont.equals(runFont);
                float glyphFontSize = glyph.getFontSizeInPt();
                Text.VerticalAlignment glyphAlignment = vAlignEst.calculateAlignment(glyph);
                if (!fontEquals || glyphFontSize != outText.getFontSize() || glyphAlignment != outText.getVerticalAlignment()) {
                    // Style changed: close the current run and open a new one
                    // carrying the glyph's style.
                    outText.setText(currentTextBuilder.toString());
                    outText = new Text();
                    outPar.content.add(outText);
                    currentTextBuilder = new StringBuilder();
                    outText.setFontName(glyphFont);
                    outText.setFontSize(glyphFontSize);
                    outText.setVerticalAlignment(glyphAlignment);
                    spaceEst = new WhiteSpaceEstimator();
                }
                if (spaceEst.hasWhiteSpace(glyph)) {
                    currentTextBuilder.append(' ');
                }
                currentTextBuilder.append(glyph.getCharacter());
            }
            // Separate consecutive lines by a space unless the line ended
            // with a hyphen.
            int length = currentTextBuilder.length();
            if (length > 0 && currentTextBuilder.charAt(length - 1) != '-') {
                currentTextBuilder.append(' ');
            }
        }
        outText.setText(currentTextBuilder.toString());
    }

    /**
     * Copies the glyph's font size, and its font name when the glyph has a
     * font with a font descriptor, onto {@code outText}.
     */
    private static void applyGlyphFont(Text outText, TextPosition glyph) throws IOException {
        outText.setFontSize(glyph.getFontSizeInPt());
        if (glyph.getFont() != null && glyph.getFont().getFontDescriptor() != null) {
            outText.setFontName(glyph.getFont().getFontDescriptor().getFontName());
        }
    }

    /**
     * Returns the font name from the glyph's font descriptor, or
     * {@code null} when the glyph has no font or no font descriptor.
     */
    private static String fontNameOf(TextPosition glyph) throws IOException {
        if (glyph.getFont() != null && glyph.getFont().getFontDescriptor() != null) {
            return glyph.getFont().getFontDescriptor().getFontName();
        }
        return null;
    }

    /** Removes the last character of {@code builder} if, and only if, it is a space. */
    private static void stripTrailingSpace(StringBuilder builder) {
        int last = builder.length() - 1;
        if (last >= 0 && builder.charAt(last) == ' ') {
            builder.deleteCharAt(last);
        }
    }

    /**
     * Merges all paragraphs of a block into one when the block has more than
     * one paragraph and their average text length is below
     * {@link #MINIMUMPARSIZE} characters.
     */
    private static void paragraphSanityCheck(TextBlock outBlock) {
        if (outBlock.content.size() <= 1) {
            return;
        }
        int accumulatedSize = 0;
        for (Paragraph par : outBlock.content) {
            for (Text text : par.content) {
                accumulatedSize += text.getText().length();
            }
        }
        double avgSize = (double)accumulatedSize / (double)outBlock.content.size();
        if (avgSize < MINIMUMPARSIZE) {
            Paragraph merged = new Paragraph();
            for (Paragraph par : outBlock.content) {
                merged.content.addAll(par.content);
            }
            outBlock.content.clear();
            outBlock.content.add(merged);
        }
    }

    /**
     * Stream engine that feeds every {@link TextPosition} reported for one
     * page into a single {@link PreTextBlock}.
     */
    private static class PDPagePreprocessor
    extends PDFStreamEngine {
        private static final String propertiesPath = "org/apache/pdfbox/resources/PDFTextStripper.properties";
        private final PDPage page;
        private final PreTextBlock preTextBlock = new PreTextBlock();

        /**
         * @param page the page whose content stream will be processed
         * @throws IOException if the operator properties cannot be loaded
         */
        public PDPagePreprocessor(PDPage page) throws IOException {
            super(ResourceLoader.loadProperties(propertiesPath, true));
            this.page = page;
        }

        /**
         * Processes the page's content stream. A page without a content
         * stream is skipped and leaves the collected block empty.
         *
         * @throws IOException if the content stream cannot be read
         */
        public void process() throws IOException {
            // getContents() can be null for pages that carry no content
            // stream; dereferencing it would throw a NullPointerException.
            if (this.page.getContents() == null) {
                return;
            }
            this.processStream(this.page, this.page.findResources(), this.page.getContents().getStream());
        }

        /** Collects each reported glyph position into the page's pre-text block. */
        protected void processTextPosition(TextPosition text) {
            this.preTextBlock.addTextPosition(text);
        }

        /** @return the block holding every glyph position collected so far */
        public PreTextBlock getPreTextBlock() {
            return this.preTextBlock;
        }
    }
}

