/*
 * Decompiled with CFR 0.152.
 */
package de.digitalcollections.solrocr.formats.alto;

import de.digitalcollections.solrocr.formats.OcrPassageFormatter;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.iter.TagBreakIterator;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.model.OcrPage;
import java.awt.Dimension;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.lucene.search.uhighlight.Passage;

/**
 * Passage formatter for OCR text stored as ALTO XML.
 *
 * <p>The markup is not parsed with an XML parser; instead a handful of regular expressions pick
 * out {@code <Page>} and {@code <String>} elements and their attributes. Because of that, the
 * configured highlighting tags are temporarily swapped for non-XML placeholders
 * ({@link #START_HL} / {@link #END_HL}) whenever the markup is scanned, so that the highlighting
 * tags themselves cannot be mistaken for OCR markup.
 */
public class AltoPassageFormatter extends OcrPassageFormatter {
    /** Placeholder standing in for the configured highlight start tag while markup is scanned. */
    private static final String START_HL = "@@STARTHLTAG@@";
    /** Placeholder standing in for the configured highlight end tag while markup is scanned. */
    private static final String END_HL = "@@ENDHLTAG@@";
    private static final Pattern pagePat = Pattern.compile("<Page ?(?<attribs>.+?)/?>");
    private static final Pattern wordPat = Pattern.compile("<String ?(?<attribs>.+?)/?>");
    private static final Pattern attribPat = Pattern.compile("(?<key>[A-Z_]+?)=\"(?<val>.+?)\"");
    private static final Pattern postContentPat = Pattern.compile("[\"']\\s*(\\w|/?>)");
    // Patterns used by getTextFromXml, compiled once instead of on every call.
    private static final Pattern spacePat = Pattern.compile("<SP.*?>");
    private static final Pattern textLinePat = Pattern.compile("(</?)?TextLine.*?>");
    private static final Pattern descriptionPat = Pattern.compile("(?s)<Description>.+?</Description>");
    private static final Pattern anyTagPat = Pattern.compile("</?[A-Z]?.*?>");
    private static final Pattern whitespacePat = Pattern.compile("\\s+");
    private final TagBreakIterator pageIter = new TagBreakIterator("Page");

    /**
     * Creates a formatter; all arguments are handed to the superclass unchanged.
     *
     * @param startHlTag markup inserted before a highlighted match
     * @param endHlTag markup inserted after a highlighted match
     * @param absoluteHighlights forwarded to {@link OcrPassageFormatter}
     * @param alignSpans whether highlight tags are moved to the boundaries of the
     *     {@code CONTENT} attribute value they fall into
     */
    protected AltoPassageFormatter(String startHlTag, String endHlTag, boolean absoluteHighlights, boolean alignSpans) {
        super(startHlTag, endHlTag, absoluteHighlights, alignSpans);
    }

    /** Replaces the configured highlight tags with their placeholders (literal, not regex). */
    private String maskHighlightTags(String text) {
        return text.replace(this.startHlTag, START_HL).replace(this.endHlTag, END_HL);
    }

    /** Replaces the placeholders with the configured highlight tags (literal, not regex). */
    private String unmaskHighlightTags(String text) {
        return text.replace(START_HL, this.startHlTag).replace(END_HL, this.endHlTag);
    }

    /**
     * Collects all {@code KEY="value"} pairs found in an element's attribute string.
     *
     * @param attribStr the text between the element name and the closing bracket
     * @return attribute names mapped to their raw (still XML-escaped) values
     */
    private Map<String, String> parseAttribs(String attribStr) {
        Map<String, String> attribs = new HashMap<>();
        Matcher m = attribPat.matcher(attribStr);
        while (m.find()) {
            attribs.put(m.group("key"), m.group("val"));
        }
        return attribs;
    }

    /**
     * Builds a page from the attributes of a {@code <Page>} element.
     *
     * @param attribs attributes of the page element
     * @return a page with the {@code ID} attribute and, when both {@code WIDTH} and
     *     {@code HEIGHT} are present and numeric, its dimensions (otherwise {@code null})
     */
    private OcrPage parsePage(Map<String, String> attribs) {
        Dimension dims = null;
        String width = attribs.get("WIDTH");
        String height = attribs.get("HEIGHT");
        if (width != null && height != null) {
            try {
                // Values may be fractional; they are truncated to whole numbers.
                dims = new Dimension((int) Double.parseDouble(width), (int) Double.parseDouble(height));
            } catch (NumberFormatException ignored) {
                // Best effort: a page with unparseable dimensions is still returned, just
                // without dimensions.
            }
        }
        return new OcrPage(attribs.get("ID"), dims);
    }

    /**
     * Determines the page that contains the given offset.
     *
     * <p>Looks up the page boundary preceding {@code startOffset}, reads at most 512 characters
     * from there and parses the first {@code <Page>} tag found in that window.
     *
     * @param ocrFragment not used by this implementation
     * @param startOffset offset into {@code content} for which the page is wanted
     * @param content the complete OCR document
     * @return the parsed page, or {@code null} if no page tag is found in the window
     */
    @Override
    public OcrPage determineStartPage(String ocrFragment, int startOffset, IterableCharSequence content) {
        this.pageIter.setText(content);
        // NOTE(review): assumes preceding() yields a valid, non-negative offset - confirm
        // against TagBreakIterator.
        int pageOffset = this.pageIter.preceding(startOffset);
        int windowEnd = Math.min(pageOffset + 512, content.length());
        String pageFragment = content.subSequence(pageOffset, windowEnd).toString();
        Matcher m = pagePat.matcher(pageFragment);
        if (!m.find()) {
            return null;
        }
        return this.parsePage(this.parseAttribs(m.group("attribs")));
    }

    /**
     * Finds every {@code <Page>} tag in the fragment.
     *
     * @param ocrFragment OCR markup to scan
     * @return the pages found, keyed by the index in {@code ocrFragment} where their tag starts
     */
    @Override
    protected TreeMap<Integer, OcrPage> parsePages(String ocrFragment) {
        TreeMap<Integer, OcrPage> map = new TreeMap<>();
        Matcher m = pagePat.matcher(ocrFragment);
        while (m.find()) {
            map.put(m.start(), this.parsePage(this.parseAttribs(m.group("attribs"))));
        }
        return map;
    }

    /**
     * Reduces a fragment of ALTO markup (possibly containing highlight tags) to plain text.
     *
     * <p>Each {@code <String>} element is replaced by its text. For hyphenated words the
     * dehyphenated form ({@code SUBS_CONTENT}) is emitted for the first part when another
     * {@code <String>} follows in the fragment, and the second part is then dropped; a second
     * part at the very beginning of the fragment keeps its own {@code CONTENT}.
     *
     * @param altoFragment ALTO markup, possibly with highlight tags inserted
     * @return whitespace-normalized, XML-unescaped text with the highlight tags preserved
     */
    @Override
    protected String getTextFromXml(String altoFragment) {
        // Highlight tags are masked so the tag-stripping below does not remove them.
        String masked = this.maskHighlightTags(altoFragment);
        masked = spacePat.matcher(masked).replaceAll(" ");
        masked = textLinePat.matcher(masked).replaceAll(" ");
        masked = descriptionPat.matcher(masked).replaceAll("");
        StringBuilder sb = new StringBuilder(masked);
        boolean isBeginning = true;
        while (true) {
            // The buffer is modified on every pass, so a fresh matcher is needed each time.
            Matcher m = wordPat.matcher(sb);
            if (!m.find()) {
                break;
            }
            int start = m.start();
            int end = m.end();
            Map<String, String> attribs = this.parseAttribs(m.group("attribs"));
            String subsType = attribs.get("SUBS_TYPE");
            String content;
            if ("HypPart1".equals(subsType)) {
                // A further <String> in the fragment means the hyphenation end is included,
                // so the dehyphenated form can be used.
                content = m.find() ? attribs.get("SUBS_CONTENT") : attribs.get("CONTENT");
            } else if ("HypPart2".equals(subsType)) {
                content = isBeginning ? attribs.get("CONTENT") : "";
            } else {
                content = attribs.get("CONTENT");
            }
            if (content == null) {
                // Missing attribute: fall back to CONTENT, or to nothing at all, rather than
                // failing on a null replacement.
                content = attribs.getOrDefault("CONTENT", "");
            }
            sb.replace(start, end, content);
            isBeginning = false;
        }
        String text = anyTagPat.matcher(sb.toString()).replaceAll("");
        text = StringEscapeUtils.unescapeXml(text).replace("\n", "");
        text = whitespacePat.matcher(text).replaceAll(" ").trim();
        return this.unmaskHighlightTags(text);
    }

    /**
     * Extracts the passage from the document and inserts the highlight tags around its matches.
     *
     * <p>With {@code alignSpans} enabled, the start tag is moved to the beginning of the value of
     * the last {@code CONTENT=} attribute before the match, and the end tag is moved forward to
     * the next closing quote that is followed by another attribute or the end of the element.
     *
     * @param passage the passage with its match offsets
     * @param content the complete OCR document
     * @return the passage markup with highlight tags inserted
     */
    @Override
    protected String getHighlightedFragment(Passage passage, IterableCharSequence content) {
        int passageStart = passage.getStartOffset();
        StringBuilder sb = new StringBuilder(content.subSequence(passageStart, passage.getEndOffset()));
        if (passage.getNumMatches() <= 0) {
            return sb.toString();
        }
        // Number of characters inserted so far; shifts all later positions in the buffer.
        int extraChars = 0;
        List<OcrPassageFormatter.PassageMatch> matches =
            this.mergeMatches(passage.getNumMatches(), passage.getMatchStarts(), passage.getMatchEnds());
        for (OcrPassageFormatter.PassageMatch match : matches) {
            // Positions are taken from the length of the extracted text rather than from
            // offset arithmetic, exactly as before.
            String preMatchContent = content.subSequence(passageStart, match.start).toString();
            int matchStart = preMatchContent.length();
            if (this.alignSpans) {
                int attribIdx = preMatchContent.lastIndexOf("CONTENT=");
                // Without a preceding attribute there is nothing to align to; keep the
                // unaligned position instead of jumping to an arbitrary offset.
                if (attribIdx >= 0) {
                    // 9 = length of CONTENT= plus the opening quote
                    matchStart = attribIdx + 9;
                }
            }
            sb.insert(matchStart + extraChars, this.startHlTag);
            extraChars += this.startHlTag.length();
            int unshiftedEnd = content.subSequence(passageStart, match.end).toString().length();
            int matchEnd = Math.min(unshiftedEnd + extraChars, sb.length());
            if (this.alignSpans && matchEnd != sb.length()) {
                Matcher m = postContentPat.matcher(sb.substring(matchEnd, sb.length()));
                if (m.find()) {
                    matchEnd += m.start();
                }
            }
            sb.insert(matchEnd, this.endHlTag);
            extraChars += this.endHlTag.length();
        }
        return sb.toString();
    }

    /**
     * Turns every {@code <String>} element of the fragment into a word box.
     *
     * <p>Boxes that belong to the same highlighted span share one randomly generated id; boxes
     * outside of any highlight get {@code null}. Coordinates are read from {@code HPOS},
     * {@code VPOS}, {@code WIDTH} and {@code HEIGHT} and truncated to whole numbers.
     *
     * @param ocrFragment ALTO markup with highlight tags inserted
     * @param pages pages of the fragment keyed by the index of their tag, used to assign each
     *     word to the closest preceding page
     * @param startPage page id used for words that precede the first page tag of the fragment
     * @return the word boxes in document order
     */
    @Override
    protected List<OcrBox> parseWords(String ocrFragment, TreeMap<Integer, OcrPage> pages, String startPage) {
        ocrFragment = this.maskHighlightTags(ocrFragment);
        List<OcrBox> wordBoxes = new ArrayList<>();
        Matcher m = wordPat.matcher(ocrFragment);
        UUID currentHighlight = null;
        // Set while a highlight that ends inside a hyphenated word waits for the next
        // hyphenation element, so that element still carries the highlight.
        boolean highlightHyphenEnd = false;
        while (m.find()) {
            String pageId = startPage;
            Map.Entry<Integer, OcrPage> pageEntry = pages.floorEntry(m.start());
            if (pageEntry != null) {
                pageId = pageEntry.getValue().id;
            }
            Map<String, String> attribs = this.parseAttribs(m.group("attribs"));
            int x = (int) Double.parseDouble(attribs.get("HPOS"));
            int y = (int) Double.parseDouble(attribs.get("VPOS"));
            int w = (int) Double.parseDouble(attribs.get("WIDTH"));
            int h = (int) Double.parseDouble(attribs.get("HEIGHT"));
            String subsType = attribs.get("SUBS_TYPE");
            // May be null: SUBS_TYPE can be present without SUBS_CONTENT.
            String subsContent = attribs.get("SUBS_CONTENT");
            String text = StringEscapeUtils.unescapeXml(attribs.get("CONTENT"));
            // null: no substitution info, TRUE: first part of a hyphenated word, FALSE: any
            // other SUBS_TYPE value.
            Boolean hyphenStart = null;
            if (subsType != null) {
                hyphenStart = Boolean.valueOf("HypPart1".equals(subsType));
            }
            if (hyphenStart != null && hyphenStart) {
                text = text + "-";
            }
            if (text.contains(START_HL) || (subsContent != null && subsContent.contains(START_HL))) {
                currentHighlight = UUID.randomUUID();
            }
            OcrBox ocrBox = new OcrBox(this.unmaskHighlightTags(text), pageId, x, y, x + w, y + h, currentHighlight);
            if (hyphenStart != null && subsContent != null) {
                ocrBox.setHyphenInfo(hyphenStart, this.unmaskHighlightTags(subsContent));
            }
            wordBoxes.add(ocrBox);

            if (currentHighlight != null && subsType != null) {
                if (subsType.equals("HypPart1") && subsContent != null && subsContent.contains(END_HL)) {
                    // The highlight ends within this hyphenated word; keep it for the
                    // following hyphenation element.
                    highlightHyphenEnd = true;
                } else if (highlightHyphenEnd) {
                    highlightHyphenEnd = false;
                    currentHighlight = null;
                }
            } else if (text.contains(END_HL) || ocrFragment.startsWith(END_HL, m.end())) {
                // The end marker is either inside the word or directly after its element.
                currentHighlight = null;
            }
        }
        return wordBoxes;
    }
}

