package com.itextpdf.pdfocr.tesseract4;

import com.itextpdf.commons.utils.MessageFormatUtil;
import com.itextpdf.commons.utils.SystemUtil;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.pdfocr.TextInfo;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrInputTesseract4Exception;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4Exception;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4ExceptionMessageConstant;
import com.itextpdf.pdfocr.tesseract4.logs.Tesseract4LogMessageConstant;
import com.itextpdf.styledxmlparser.jsoup.Jsoup;
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
import com.itextpdf.styledxmlparser.jsoup.select.Elements;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/itextpdf/pdfocr/tesseract4/TesseractHelper.class */
/**
 * Static helpers used around Tesseract 4 OCR output: parsing hOCR files into
 * {@link TextInfo} items, converting and repairing bounding boxes, simple text-file
 * operations and running an external command.
 *
 * <p>The class is a pure utility holder and cannot be instantiated.
 */
public class TesseractHelper {
    private static final Logger LOGGER = LoggerFactory.getLogger(TesseractHelper.class);

    /** Matches a {@code title} attribute that contains {@code bbox} followed by four integers. */
    private static final Pattern BBOX_PATTERN = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
    /** Captures the four integer coordinates as groups 1..4. */
    private static final Pattern BBOX_COORDINATE_PATTERN = Pattern.compile(".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
    /** Captures the {@code x_wconf <number>} fragment of a {@code title} attribute as group 1. */
    private static final Pattern WCONF_PATTERN = Pattern.compile("^.*(x_wconf *\\d+).*$");

    /** Number of coordinates in a bbox, and the position of each coordinate in the parsed list. */
    private static final int BBOX_ARRAY_SIZE = 4;
    private static final int LEFT_IDX = 0;
    private static final int TOP_IDX = 1;
    private static final int RIGHT_IDX = 2;
    private static final int BOTTOM_IDX = 3;

    /** Factor applied by {@link #toPoints(float)} and reversed by {@link #toPixels(float)}. */
    private static final float PX_TO_PT = 0.75f;

    private static final String NEW_LINE_PATTERN = "\n+";
    private static final String SPACE_PATTERN = " +";
    private static final String NEW_LINE_OR_SPACE_PATTERN = "[\n ]+";
    private static final String PAGE_PREFIX_PATTERN = "page_";
    private static final String OCR_PAGE = "ocr_page";
    private static final String OCR_LINE = "ocr_line";
    private static final String OCR_CAPTION = "ocr_caption";
    private static final String OCRX_WORD = "ocrx_word";
    private static final String TITLE = "title";
    private static final String X_WCONF = "x_wconf";

    private TesseractHelper() {
    }

    /**
     * Parses the given hOCR files and collects the recognized text per page.
     *
     * <p>For every element with class {@code ocr_page} the page number is taken from the
     * part of the element id that follows {@code page_}. If that number is already present
     * in the result, the page is stored under {@code max(existing keys) + 1} instead.
     * Pages that yield no text are not added. Nodes whose bbox could not be parsed are
     * reported once, as warnings, after all files have been processed.
     *
     * @param list hOCR files; {@code null} entries and files that do not exist are skipped
     * @param list2 optional txt files, index-aligned with {@code list}; when not
     *        {@code null}, the file at the same index is read as UTF-8 lines and used to
     *        look up the text of each hOCR line
     * @param tesseract4OcrEngineProperties properties supplying the text positioning mode
     *        and the minimal confidence level
     * @return insertion-ordered map from page number to the text items of that page
     * @throws IOException if a txt or hOCR file cannot be read
     */
    public static Map<Integer, List<TextInfo>> parseHocrFile(List<File> list, List<File> list2, Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) throws IOException {
        Map<Integer, List<TextInfo>> textByPage = new LinkedHashMap<>();
        Map<String, Node> unparsedBBoxes = new LinkedHashMap<>();
        for (int i = 0; i < list.size(); i++) {
            File hocrFile = list.get(i);
            List<String> txtLines = null;
            if (list2 != null) {
                txtLines = Files.readAllLines(list2.get(i).toPath(), StandardCharsets.UTF_8);
            }
            if (hocrFile == null || !Files.exists(Paths.get(hocrFile.getAbsolutePath()))) {
                continue;
            }
            // try-with-resources: the stream is released even if parsing throws.
            try (FileInputStream hocrStream = new FileInputStream(hocrFile.getAbsolutePath())) {
                Elements pages = Jsoup.parse(hocrStream, StandardCharsets.UTF_8.name(), hocrFile.getAbsolutePath())
                        .getElementsByClass(OCR_PAGE);
                for (Element page : pages) {
                    String[] idParts = page.id().split(PAGE_PREFIX_PATTERN);
                    int pageNumber = Integer.parseInt(idParts[idParts.length - 1]);
                    List<TextInfo> pageText = getTextData(page, tesseract4OcrEngineProperties, txtLines, unparsedBBoxes);
                    if (!pageText.isEmpty()) {
                        if (textByPage.containsKey(pageNumber)) {
                            // Page number clash (e.g. several input files): append after the last page.
                            pageNumber = Collections.max(textByPage.keySet()) + 1;
                        }
                        textByPage.put(pageNumber, pageText);
                    }
                }
            }
        }
        for (Node node : unparsedBBoxes.values()) {
            LOGGER.warn(MessageFormatUtil.format(Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX, node.toString()));
        }
        return textByPage;
    }

    /**
     * Parses the bbox of an element and aligns it according to the text positioning mode.
     *
     * <p>For {@link TextPositioning#BY_WORDS_AND_LINES} the bottom and top of the result are
     * replaced by those of the parent element's bbox. For that mode and for
     * {@link TextPositioning#BY_WORDS} the horizontal extent is additionally checked against
     * the parent's bbox via {@link #detectAndFixBrokenBBoxes}. For other modes the parsed
     * bbox is returned unchanged.
     *
     * @param element element whose bbox is requested
     * @param textPositioning positioning mode
     * @param rectangle page bbox passed through to {@link #parseBBox}
     * @param map collector for nodes whose bbox could not be parsed
     * @return the aligned bbox
     */
    static Rectangle getAlignedBBox(Element element, TextPositioning textPositioning, Rectangle rectangle, Map<String, Node> map) {
        Rectangle box = parseBBox(element, rectangle, map);
        boolean byWordsAndLines = TextPositioning.BY_WORDS_AND_LINES == textPositioning;
        if (byWordsAndLines || TextPositioning.BY_WORDS == textPositioning) {
            Rectangle parentBox = parseBBox(element.parent(), rectangle, map);
            if (byWordsAndLines) {
                box.setBbox(box.getLeft(), parentBox.getBottom(), box.getRight(), parentBox.getTop());
            }
            detectAndFixBrokenBBoxes(element, box, parentBox, rectangle, map);
        }
        return box;
    }

    /**
     * Reads the four bbox coordinates from the node's {@code title} attribute and converts
     * them with {@link #toPoints(float)}.
     *
     * <p>If the attribute holds no parsable bbox, all four coordinates default to zero and
     * the node is recorded in {@code map} under its {@code id} attribute (first occurrence
     * only).
     *
     * @param node node carrying the {@code title} attribute
     * @param rectangle page bbox, or {@code null}. When {@code null}, the result is
     *        constructed directly from the converted left, top and right values and the
     *        converted difference {@code bottom - top}. Otherwise the top and bottom values
     *        are subtracted from {@code rectangle.getTop()}, which flips the vertical axis.
     * @param map collector for nodes whose bbox could not be parsed
     * @return the resulting rectangle
     */
    static Rectangle parseBBox(Node node, Rectangle rectangle, Map<String, Node> map) {
        List<Float> coordinates = new ArrayList<>(BBOX_ARRAY_SIZE);
        Matcher bboxMatcher = BBOX_PATTERN.matcher(node.attr(TITLE));
        if (bboxMatcher.matches()) {
            Matcher coordinateMatcher = BBOX_COORDINATE_PATTERN.matcher(bboxMatcher.group());
            if (coordinateMatcher.matches()) {
                for (int i = 0; i < BBOX_ARRAY_SIZE; i++) {
                    coordinates.add(Float.parseFloat(coordinateMatcher.group(i + 1)));
                }
            }
        }
        if (coordinates.isEmpty()) {
            coordinates = Arrays.asList(0.0f, 0.0f, 0.0f, 0.0f);
            String id = node.attr("id");
            if (id != null && !map.containsKey(id)) {
                map.put(id, node);
            }
        }
        float left = coordinates.get(LEFT_IDX);
        float top = coordinates.get(TOP_IDX);
        float right = coordinates.get(RIGHT_IDX);
        float bottom = coordinates.get(BOTTOM_IDX);
        if (rectangle == null) {
            return new Rectangle(toPoints(left), toPoints(top), toPoints(right), toPoints(bottom - top));
        }
        return new Rectangle(0.0f, 0.0f).setBbox(
                toPoints(left),
                rectangle.getTop() - toPoints(top),
                toPoints(right),
                rectangle.getTop() - toPoints(bottom));
    }

    /**
     * Repairs the horizontal extent of {@code rectangle} when it does not fit inside
     * {@code rectangle2}.
     *
     * <p>If the left edge lies outside the horizontal range of {@code rectangle2}, it is moved
     * to the right edge of the previous sibling's bbox, or to the left edge of
     * {@code rectangle2} when there is no previous sibling. The right edge is handled the
     * same way using the next sibling's left edge or the right edge of {@code rectangle2}.
     * {@code rectangle} is modified in place.
     *
     * @param element element the bbox belongs to; used to reach its siblings
     * @param rectangle bbox to check and fix
     * @param rectangle2 bbox of the parent element
     * @param rectangle3 page bbox passed through to {@link #parseBBox}
     * @param map collector for nodes whose bbox could not be parsed
     */
    static void detectAndFixBrokenBBoxes(Element element, Rectangle rectangle, Rectangle rectangle2, Rectangle rectangle3, Map<String, Node> map) {
        if (rectangle.getLeft() < rectangle2.getLeft() || rectangle.getLeft() > rectangle2.getRight()) {
            Element previous = element.previousElementSibling();
            if (previous == null) {
                rectangle.setX(rectangle2.getLeft());
            } else {
                rectangle.setX(parseBBox(previous, rectangle3, map).getRight());
            }
        }
        if (rectangle.getRight() > rectangle2.getRight() || rectangle.getRight() < rectangle2.getLeft()) {
            Element next = element.nextElementSibling();
            float fixedRight = next == null
                    ? rectangle2.getRight()
                    : parseBBox(next, rectangle3, map).getLeft();
            rectangle.setBbox(rectangle.getLeft(), rectangle.getBottom(), fixedRight, rectangle.getTop());
        }
    }

    /**
     * Converts a value from points to pixels by dividing by the conversion factor.
     *
     * @param f value in points
     * @return value in pixels
     */
    static float toPixels(float f) {
        return f / PX_TO_PT;
    }

    /**
     * Converts a value from pixels to points by multiplying by the conversion factor.
     *
     * @param f value in pixels
     * @return value in points
     */
    static float toPoints(float f) {
        return f * PX_TO_PT;
    }

    /**
     * Deletes the file at the given path if it exists. Failures are logged at info level
     * and not propagated.
     *
     * @param str path of the file to delete; {@code null} or empty is ignored
     */
    public static void deleteFile(String str) {
        if (str == null || str.isEmpty()) {
            return;
        }
        try {
            if (Files.exists(Paths.get(str))) {
                Files.delete(Paths.get(str));
            }
        } catch (IOException | SecurityException e) {
            LOGGER.info(MessageFormatUtil.format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, str, e.getMessage()));
        }
    }

    /**
     * Reads the whole file as a UTF-8 string.
     *
     * @param file file to read
     * @return file content, or {@code null} if reading failed (the failure is logged)
     */
    public static String readTxtFile(File file) {
        try {
            return new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
        } catch (IOException e) {
            LOGGER.error(MessageFormatUtil.format(Tesseract4LogMessageConstant.CANNOT_READ_FILE, file.getAbsolutePath(), e.getMessage()));
            return null;
        }
    }

    /**
     * Writes the given text to a file using UTF-8.
     *
     * @param str path of the file to write
     * @param str2 text to write
     * @throws PdfOcrInputTesseract4Exception if the file cannot be opened or written
     */
    public static void writeToTextFile(String str, String str2) {
        try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(str), StandardCharsets.UTF_8)) {
            writer.write(str2);
        } catch (IOException e) {
            throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_WRITE_TO_FILE, e);
        }
    }

    /**
     * Runs the command without a working directory; see
     * {@link #runCommand(String, List, String)}.
     *
     * @param str command to execute
     * @param list command arguments
     * @throws PdfOcrTesseract4Exception if the command fails
     */
    public static void runCommand(String str, List<String> list) throws PdfOcrTesseract4Exception {
        runCommand(str, list, null);
    }

    /**
     * Runs the command with its arguments joined by single spaces and waits for it to end.
     *
     * @param str command to execute
     * @param list command arguments
     * @param str2 third argument of {@code SystemUtil.runProcessAndWait}; may be {@code null}
     *        (presumably the working directory - confirm against {@code SystemUtil})
     * @throws PdfOcrTesseract4Exception if the process reports failure, cannot be started,
     *         or the wait is interrupted
     */
    public static void runCommand(String str, List<String> list, String str2) throws PdfOcrTesseract4Exception {
        try {
            String arguments = String.join(" ", list);
            if (!SystemUtil.runProcessAndWait(str, arguments, str2)) {
                LOGGER.error(MessageFormatUtil.format(Tesseract4LogMessageConstant.COMMAND_FAILED, str + " " + arguments));
                throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED);
            }
        } catch (IOException | InterruptedException e) {
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers further up can still observe it.
                Thread.currentThread().interrupt();
            }
            LOGGER.error(MessageFormatUtil.format(Tesseract4LogMessageConstant.COMMAND_FAILED, e.getMessage()));
            throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED);
        }
    }

    /**
     * Collects the text items of one page: gathers all {@code ocr_line} elements followed by
     * all {@code ocr_caption} elements and delegates to the list-based overload.
     */
    private static List<TextInfo> getTextData(Element page, Tesseract4OcrEngineProperties properties, List<String> txtLines, Map<String, Node> unparsedBBoxes) {
        // The page bbox is parsed without a reference rectangle (null).
        Rectangle pageBBox = parseBBox(page, null, unparsedBBoxes);
        Elements linesAndCaptions = new Elements();
        for (String cssClass : Arrays.asList(OCR_LINE, OCR_CAPTION)) {
            linesAndCaptions.addAll(page.getElementsByClass(cssClass));
        }
        return getTextData(linesAndCaptions, properties, txtLines, pageBBox, unparsedBBoxes);
    }

    /**
     * Converts line/caption elements into text items. Elements with empty text or with an
     * average word confidence below the configured minimum are skipped. Depending on the
     * positioning mode, each element contributes per-word items or a single line item.
     */
    private static List<TextInfo> getTextData(List<Element> linesAndCaptions, Tesseract4OcrEngineProperties properties, List<String> txtLines, Rectangle pageBBox, Map<String, Node> unparsedBBoxes) {
        List<TextInfo> textData = new ArrayList<>();
        for (Element lineOrCaption : linesAndCaptions) {
            if (lineOrCaption.text().isEmpty()
                    || !isElementConfident(lineOrCaption, properties.getMinimalConfidenceLevel())) {
                continue;
            }
            String txtLine = findHocrLineInTxt(lineOrCaption, txtLines);
            TextPositioning positioning = properties.getTextPositioning();
            if (positioning == TextPositioning.BY_WORDS || positioning == TextPositioning.BY_WORDS_AND_LINES) {
                textData.addAll(getTextDataForWords(lineOrCaption, txtLine, positioning, pageBBox, unparsedBBoxes));
            } else {
                textData.addAll(getTextDataForLines(lineOrCaption, txtLine, pageBBox, unparsedBBoxes));
            }
        }
        return textData;
    }

    /**
     * Checks whether the average {@code x_wconf} value over the direct child elements
     * (integer division) reaches the minimal level. Returns {@code true} when the minimal
     * level is 0 or when no child carries an {@code x_wconf} value.
     */
    private static boolean isElementConfident(Element lineOrCaption, int minimalConfidenceLevel) {
        if (minimalConfidenceLevel == 0) {
            return true;
        }
        int confidenceTotal = 0;
        int confidenceCount = 0;
        for (Node child : lineOrCaption.childNodes()) {
            if (!(child instanceof Element)) {
                continue;
            }
            Matcher matcher = WCONF_PATTERN.matcher(child.attr(TITLE));
            if (matcher.matches()) {
                // Group 1 is "x_wconf <number>"; strip the key to keep only the number.
                String wconf = matcher.group(1);
                if (wconf != null) {
                    confidenceTotal += Integer.parseInt(wconf.replaceAll(X_WCONF, "").trim());
                    confidenceCount++;
                }
            }
        }
        return confidenceCount <= 0 || confidenceTotal / confidenceCount >= minimalConfidenceLevel;
    }

    /**
     * Builds per-word text items for a line or caption.
     *
     * <p>Without a txt line every {@code ocrx_word} element becomes one item. With a txt
     * line, the line is split on spaces and consecutive hOCR words are accumulated until
     * their concatenated text equals the next txt item; the accumulated words are then
     * merged into a single item.
     */
    private static List<TextInfo> getTextDataForWords(Element lineOrCaption, String txtLine, TextPositioning textPositioning, Rectangle pageBBox, Map<String, Node> unparsedBBoxes) {
        List<TextInfo> textData = new ArrayList<>();
        if (txtLine == null) {
            for (Element word : lineOrCaption.getElementsByClass(OCRX_WORD)) {
                addToTextData(textData, word.text(), getAlignedBBox(word, textPositioning, pageBBox, unparsedBBoxes));
            }
            return textData;
        }
        List<TextInfo> pendingWords = new ArrayList<>();
        String[] txtItems = txtLine.replaceAll(NEW_LINE_PATTERN, "").replaceAll(SPACE_PATTERN, " ").split(" ");
        for (Element word : lineOrCaption.getElementsByClass(OCRX_WORD)) {
            pendingWords.add(new TextInfo(word.text(), getAlignedBBox(word, textPositioning, pageBBox, unparsedBBoxes)));
            // NOTE(review): assumes txtItems is not exhausted while words remain; otherwise
            // txtItems[0] throws ArrayIndexOutOfBoundsException - confirm with real inputs.
            String expected = txtItems[0].replaceAll(NEW_LINE_OR_SPACE_PATTERN, "");
            String accumulated = getTextInfosText(pendingWords).replaceAll(SPACE_PATTERN, "");
            if (expected.equals(accumulated)) {
                txtItems = Arrays.copyOfRange(txtItems, 1, txtItems.length);
                addToTextData(textData, mergeTextInfos(pendingWords));
                pendingWords.clear();
            }
        }
        return textData;
    }

    /**
     * Builds a single text item for a line or caption, using the txt line as text when
     * available and the element's own text otherwise.
     */
    private static List<TextInfo> getTextDataForLines(Element lineOrCaption, String txtLine, Rectangle pageBBox, Map<String, Node> unparsedBBoxes) {
        List<TextInfo> textData = new ArrayList<>();
        Rectangle bbox = getAlignedBBox(lineOrCaption, TextPositioning.BY_LINES, pageBBox, unparsedBBoxes);
        addToTextData(textData, txtLine == null ? lineOrCaption.text() : txtLine, bbox);
        return textData;
    }

    /** Appends a new item with the given text and bbox. */
    private static void addToTextData(List<TextInfo> textData, String text, Rectangle bbox) {
        textData.add(new TextInfo(text, bbox));
    }

    /** Appends a new item built from the text and bbox of the given one. */
    private static void addToTextData(List<TextInfo> textData, TextInfo textInfo) {
        addToTextData(textData, textInfo.getText(), textInfo.getBboxRect());
    }

    /** Concatenates the texts of all items without a separator. */
    private static String getTextInfosText(List<TextInfo> textInfos) {
        StringBuilder text = new StringBuilder();
        for (TextInfo textInfo : textInfos) {
            text.append(textInfo.getText());
        }
        return text.toString();
    }

    /**
     * Merges a non-empty list of items into one: texts are concatenated, and the bbox spans
     * from the left of the accumulated box to the right of each following box, covering the
     * lowest bottom and highest top seen.
     */
    private static TextInfo mergeTextInfos(List<TextInfo> textInfos) {
        TextInfo merged = new TextInfo(textInfos.get(0));
        for (int i = 1; i < textInfos.size(); i++) {
            TextInfo next = textInfos.get(i);
            merged.setText(merged.getText() + next.getText());
            Rectangle mergedBox = merged.getBboxRect();
            Rectangle nextBox = next.getBboxRect();
            merged.setBboxRect(new Rectangle(0.0f, 0.0f).setBbox(
                    mergedBox.getLeft(),
                    Math.min(mergedBox.getBottom(), nextBox.getBottom()),
                    nextBox.getRight(),
                    Math.max(mergedBox.getTop(), nextBox.getTop())));
        }
        return merged;
    }

    /**
     * Finds the first txt line whose content equals the element's text once runs of spaces
     * are removed from both.
     *
     * @return the matching txt line as-is, or {@code null} when there are no txt lines, the
     *         element text is empty after removing spaces, or nothing matches
     */
    private static String findHocrLineInTxt(Element lineOrCaption, List<String> txtLines) {
        if (txtLines == null) {
            return null;
        }
        String hocrText = lineOrCaption.text().replaceAll(SPACE_PATTERN, "");
        if (hocrText.isEmpty()) {
            return null;
        }
        for (String txtLine : txtLines) {
            if (txtLine.replaceAll(SPACE_PATTERN, "").equals(hocrText)) {
                return txtLine;
            }
        }
        return null;
    }
}
