/*
 * Decompiled with CFR 0.152.
 */
package de.digitalcollections.solrocr.formats.hocr;

import de.digitalcollections.solrocr.formats.OcrPassageFormatter;
import de.digitalcollections.solrocr.formats.hocr.HocrClassBreakIterator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.model.OcrPage;
import java.awt.Dimension;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.StringEscapeUtils;

/**
 * {@link OcrPassageFormatter} implementation that reads pages and word boxes out of hOCR markup.
 *
 * <p>The markup is not parsed as XML; page elements and word spans are located with the regular
 * expressions declared below.
 */
public class HocrPassageFormatter extends OcrPassageFormatter {
    /** Matches an {@code ocrx_word} span and captures its bounding box coordinates and text. */
    private static final Pattern wordPat = Pattern.compile("<span class=['\"]ocrx_word['\"].+?title=['\"].*?bbox (?<ulx>\\d+) (?<uly>\\d+) (?<lrx>\\d+) (?<lry>\\d+);?.*?>(?<text>.+?)</span>");
    /** Matches the opening tag of an {@code ocr_page} div and captures the attributes after the class. */
    private static final Pattern pageElemPat = Pattern.compile("<div.+?class=['\"]ocr_page['\"]\\s*(?<attribs>.+?)>");
    /** Matches one of the supported page identifiers: an {@code id} attribute, {@code x_source} or {@code ppageno}. */
    private static final Pattern pageIdPat = Pattern.compile("(?:id=['\"](?<id>.+?)['\"]|x_source (?<source>.+?)['\";]|ppageno (?<pageno>\\d+))");
    /** Matches a page bounding box anchored at the origin and captures its width and height. */
    private static final Pattern pageBboxPat = Pattern.compile("bbox 0 0 (?<width>\\d+) (?<height>\\d+)");
    /** Number of characters, counted from a page start, that are searched for the opening page tag. */
    private static final int PAGE_TAG_WINDOW = 256;
    /** Message of the exception raised when a page carries no usable identifier. */
    private static final String NO_PAGE_ID_MESSAGE = "Pages must have an identifier, check your source files!";

    private final HocrClassBreakIterator pageIter = new HocrClassBreakIterator("ocr_page");
    private final String startHlTag;
    private final String endHlTag;

    /**
     * Creates a formatter. All arguments are passed through to the superclass; the highlight tags
     * are additionally kept locally because {@link #parseWords} needs them.
     *
     * @param startHlTag marker that opens a highlighted region inside the OCR text
     * @param endHlTag marker that closes a highlighted region inside the OCR text
     * @param absoluteHighlights forwarded to the superclass
     * @param alignSpans forwarded to the superclass
     */
    public HocrPassageFormatter(String startHlTag, String endHlTag, boolean absoluteHighlights, boolean alignSpans) {
        super(startHlTag, endHlTag, absoluteHighlights, alignSpans);
        this.startHlTag = startHlTag;
        this.endHlTag = endHlTag;
    }

    /**
     * Builds the exception for a page without identifier.
     *
     * <p>Created on demand so that successfully parsed pages do not pay for an exception
     * (and its stack trace) that is never thrown.
     */
    private static RuntimeException missingPageId() {
        return new RuntimeException(NO_PAGE_ID_MESSAGE);
    }

    /**
     * Builds an {@link OcrPage} from the attribute string of a page element.
     *
     * <p>Only the first match of {@link #pageIdPat} in the attributes is examined; within that
     * match the groups are consulted in the order {@code id}, {@code source}, {@code pageno}.
     * The page dimensions stay {@code null} if the attributes contain no {@code bbox 0 0 w h}.
     *
     * @param pageAttribs attribute text captured from the page element, may be {@code null}
     * @param pagePos position of the page element; currently unused
     * @return the parsed page
     * @throws RuntimeException if no page identifier can be determined
     */
    private OcrPage parsePage(String pageAttribs, int pagePos) {
        if (pageAttribs == null) {
            throw missingPageId();
        }
        Matcher idMatch = pageIdPat.matcher(pageAttribs);
        if (!idMatch.find()) {
            throw missingPageId();
        }
        String pageId = Stream.of("id", "source", "pageno")
            .map(idMatch::group)
            .filter(StringUtils::isNotEmpty)
            .findFirst()
            .orElseThrow(HocrPassageFormatter::missingPageId);

        Dimension pageDims = null;
        Matcher boxMatch = pageBboxPat.matcher(pageAttribs);
        if (boxMatch.find()) {
            int width = Integer.parseInt(boxMatch.group("width"));
            int height = Integer.parseInt(boxMatch.group("height"));
            pageDims = new Dimension(width, height);
        }
        return new OcrPage(pageId, pageDims);
    }

    /**
     * Determines the page on which a passage starts by looking backwards from the passage start
     * for the closest page break and parsing the page element found there.
     *
     * @param ocrFragment the passage fragment; not used by this implementation
     * @param startOffset offset of the passage start within {@code content}
     * @param content the complete OCR content
     * @return the page, or {@code null} if no page element is found within the first
     *     {@value #PAGE_TAG_WINDOW} characters after the page break (or no usable break exists)
     * @throws RuntimeException if a page element is found but carries no identifier
     */
    @Override
    public OcrPage determineStartPage(String ocrFragment, int startOffset, IterableCharSequence content) {
        this.pageIter.setText(content);
        int pageOffset = this.pageIter.preceding(startOffset);
        if (pageOffset < 0) {
            // Defensive: a negative offset cannot be sliced below. NOTE(review): whether the
            // iterator can actually report "no break" this way is not visible here - confirm.
            return null;
        }
        int windowEnd = Math.min(pageOffset + PAGE_TAG_WINDOW, content.length());
        String pageFragment = content.subSequence(pageOffset, windowEnd).toString();
        Matcher m = pageElemPat.matcher(pageFragment);
        if (!m.find()) {
            return null;
        }
        return this.parsePage(m.group("attribs"), m.start());
    }

    /**
     * Collects all page elements of a fragment.
     *
     * @param ocrFragment hOCR markup to scan
     * @return the parsed pages, keyed by the offset of their opening tag within the fragment
     * @throws RuntimeException if a page element carries no identifier
     */
    @Override
    protected TreeMap<Integer, OcrPage> parsePages(String ocrFragment) {
        TreeMap<Integer, OcrPage> pagesByOffset = new TreeMap<>();
        Matcher m = pageElemPat.matcher(ocrFragment);
        while (m.find()) {
            pagesByOffset.put(m.start(), this.parsePage(m.group("attribs"), m.start()));
        }
        return pagesByOffset;
    }

    /**
     * Extracts the word boxes of a fragment.
     *
     * <p>Each word is assigned to the page whose start offset is the closest one at or before the
     * word; words located before any known page start get {@code startPage}. A new highlight id is
     * generated whenever a word contains the start tag, and it is attached to every following word
     * until a word contains the end tag or is immediately followed by it.
     *
     * @param ocrFragment hOCR markup to scan
     * @param pages pages of the fragment keyed by their start offset
     * @param startPage page identifier for words that precede every entry in {@code pages}
     * @return the word boxes in document order
     */
    @Override
    protected List<OcrBox> parseWords(String ocrFragment, TreeMap<Integer, OcrPage> pages, String startPage) {
        List<OcrBox> wordBoxes = new ArrayList<>();
        Matcher m = wordPat.matcher(ocrFragment);
        UUID currentHighlight = null;
        while (m.find()) {
            // A single floor lookup yields both the "is there a page" answer and the page itself.
            Map.Entry<Integer, OcrPage> pageEntry = pages.floorEntry(m.start());
            String pageId = pageEntry != null ? pageEntry.getValue().id : startPage;

            int x0 = Integer.parseInt(m.group("ulx"));
            int y0 = Integer.parseInt(m.group("uly"));
            int x1 = Integer.parseInt(m.group("lrx"));
            int y1 = Integer.parseInt(m.group("lry"));
            String text = StringEscapeUtils.unescapeXml(m.group("text"));

            if (text.contains(this.startHlTag)) {
                currentHighlight = UUID.randomUUID();
            }
            wordBoxes.add(new OcrBox(text, pageId, x0, y0, x1, y1, currentHighlight));

            // The end tag may sit inside the word text or directly behind the closing span.
            boolean endOfHl = text.contains(this.endHlTag) || ocrFragment.startsWith(this.endHlTag, m.end());
            if (endOfHl) {
                currentHighlight = null;
            }
        }
        return wordBoxes;
    }
}

