package jp.go.nict.langrid.wrapper.ws_1_2.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:jp/go/nict/langrid/wrapper/ws_1_2/util/TextParser.class */
/**
 * Static helpers that prepare markup-bearing text for sentence-by-sentence
 * processing: line breaks are replaced by placeholder markers, tags outside a
 * fixed allow-list are stripped, and the first sentence of a text can be
 * split off from the remainder.
 *
 * <p>All compiled patterns are shared constants; {@link Pattern} instances are
 * immutable and safe to reuse, while a fresh {@link Matcher} is created per call.
 */
public class TextParser {
    /**
     * Sentinel meaning "no boundary found" when searching for cut positions.
     * NOTE(review): because this is a real, reachable offset, boundaries located
     * at or beyond this index are treated as absent - confirm that inputs are
     * always shorter than this.
     */
    private static final int MAX_OFFSET = 100000;

    /** A CR LF pair together with the spaces, tabs, vertical tabs and form feeds around it. */
    private static final Pattern CRLF_WITH_PADDING = Pattern.compile("[ \\t\\x0B\\f]*\\r\\n[ \\t\\x0B\\f]*");

    /** A lone CR together with the spaces, tabs, vertical tabs and form feeds around it. */
    private static final Pattern CR_WITH_PADDING = Pattern.compile("[ \\t\\x0B\\f]*\\r[ \\t\\x0B\\f]*");

    /** A lone LF together with the spaces, tabs, vertical tabs and form feeds around it. */
    private static final Pattern LF_WITH_PADDING = Pattern.compile("[ \\t\\x0B\\f]*\\n[ \\t\\x0B\\f]*");

    /** Any opening or closing tag, used to locate tags while splitting sentences. */
    private static final Pattern ANY_TAG = Pattern.compile("<\\/?[^<>]*>");

    /** A line-break placeholder such as {@code [[#ret_n]]}. */
    private static final Pattern RET_MARKER = Pattern.compile("\\[\\[#ret[^]]*\\]\\]");

    /**
     * A tag whose name is captured in group 1.
     * NOTE(review): the pattern contains no literal letters, so the
     * case-insensitive flag has no effect on matching; the allow-list lookup
     * in {@link #stripTags} stays case-sensitive.
     */
    private static final Pattern STRIPPABLE_TAG =
            Pattern.compile("<[/!]?([^\\s>]*)\\s*[^>]*>", Pattern.CASE_INSENSITIVE);

    /** Tag names that {@link #preprocessOriginal} keeps in the text. */
    private static final ArrayList<String> allowedTags = new ArrayList<>(Arrays.asList(
            "ul", "li", "ol", "dt", "dl", "dd", "table", "tr", "th", "td", "br",
            "h1", "h2", "h3", "h4", "h5", "h6"));

    /** Maps a line-break sequence to the placeholder marker that replaces it. */
    private static final HashMap<String, String> retCodeMap = new HashMap<>();

    static {
        retCodeMap.put("\n", "[[#ret_n]]");
        retCodeMap.put("\r", "[[#ret_r]]");
        retCodeMap.put("\r\n", "[[#ret_rn]]");
    }

    /**
     * Normalises raw text before further processing.
     *
     * <p>Steps, in order: every backslash is removed; {@code &nbsp;} becomes a
     * space; each line break (CR LF first, then CR, then LF), including the
     * horizontal whitespace around it, is replaced by its placeholder marker;
     * tags not in the internal allow-list are stripped; finally the result is
     * passed through {@code ExceptionWord.encodeExceptionWord} and
     * {@code ExceptionWord.encodeInvalidSeparatorWithLanguage}.
     *
     * @param str  forwarded unchanged to
     *             {@code ExceptionWord.encodeInvalidSeparatorWithLanguage}
     *             (presumably a language code - verify against callers)
     * @param str2 the text to preprocess; must not be {@code null}
     * @return the preprocessed text
     */
    public static String preprocessOriginal(String str, String str2) {
        String text = str2.replace("\\", "").replace("&nbsp;", " ");
        // CR LF must be handled before the single-character variants so the
        // pair is not turned into two separate markers.
        text = CRLF_WITH_PADDING.matcher(text).replaceAll(retCodeMap.get("\r\n"));
        text = CR_WITH_PADDING.matcher(text).replaceAll(retCodeMap.get("\r"));
        text = LF_WITH_PADDING.matcher(text).replaceAll(retCodeMap.get("\n"));
        String stripped = stripTags(text, allowedTags);
        return ExceptionWord.encodeInvalidSeparatorWithLanguage(
                ExceptionWord.encodeExceptionWord(stripped), str);
    }

    /**
     * Splits the first sentence off the front of a text.
     *
     * <p>Leading tags are consumed first (only the last one consumed is
     * reported). The text is then cut at the earliest of: the next tag, the
     * position just after the first character of the earliest separator
     * occurrence, or the next line-break placeholder. A placeholder at the very
     * start of the text is returned on its own as the "sentence".
     *
     * @param str  passed to {@code ExceptionWord.getSeparators} to obtain the
     *             sentence separators (presumably a language code - verify
     *             against callers)
     * @param str2 the text to split; must not be {@code null}
     * @return a map with the keys {@code "first"} (the sentence, decoded via
     *         {@code ExceptionWord.decode} and trimmed, or the raw placeholder),
     *         {@code "tag"} (the last leading tag consumed, or an empty string)
     *         and {@code "remain"} (the text after the cut)
     */
    public static HashMap<String, String> getFirstSentence(String str, String str2) {
        ArrayList<String> separators = ExceptionWord.getSeparators(str);
        HashMap<String, String> result = new HashMap<>();

        // Peel off tags sitting at the very start; remember where the next
        // non-leading tag begins, if there is one.
        String leadingTag = "";
        int tagCut = MAX_OFFSET;
        String text;
        while (true) {
            text = str2.trim();
            Matcher tag = ANY_TAG.matcher(text);
            if (!tag.find()) {
                break;
            }
            if (tag.start() != 0) {
                tagCut = tag.start();
                break;
            }
            leadingTag = tag.group();
            str2 = text.substring(tag.end());
        }

        // Earliest separator; the cut is placed one character past its start so
        // that character stays with the sentence.
        // NOTE(review): this assumes single-character separators - confirm.
        int separatorCut = MAX_OFFSET;
        for (String separator : separators) {
            int index = text.indexOf(separator);
            if (index != -1 && index + 1 < separatorCut) {
                separatorCut = index + 1;
            }
        }

        int markerCut = MAX_OFFSET;
        Matcher marker = RET_MARKER.matcher(text);
        if (marker.find()) {
            markerCut = marker.start();
            if (markerCut == 0) {
                // A placeholder at the start is emitted as its own segment and
                // any leading tag consumed above is not reported.
                result.put("first", marker.group());
                result.put("tag", "");
                result.put("remain", text.substring(marker.end()));
                return result;
            }
        }

        int cut = Math.min(Math.min(separatorCut, tagCut), markerCut);
        String first;
        String remain;
        if (cut == MAX_OFFSET) {
            first = text;
            remain = "";
        } else {
            first = text.substring(0, cut);
            remain = text.substring(cut);
        }
        result.put("first", ExceptionWord.decode(first).trim());
        result.put("tag", leadingTag);
        result.put("remain", remain);
        return result;
    }

    /**
     * Removes every tag whose name is not in the given allow-list.
     *
     * <p>Allowed tags are copied through verbatim together with the text in
     * front of them. When a tag is removed, the text in front of it is trimmed
     * and joined to the output with a single space, unless the output so far
     * ends with {@code '>'}, a space or {@code "]]"}. If the input contains no
     * tag at all it is returned unchanged (not trimmed); otherwise the result
     * is trimmed.
     *
     * @param str       the text to clean; must not be {@code null}
     * @param arrayList tag names to keep (compared case-sensitively against the
     *                  captured tag name); {@code null} means keep none
     * @return the text with disallowed tags removed
     */
    public static String stripTags(String str, ArrayList<String> arrayList) {
        String[] sortedAllowed;
        if (arrayList != null) {
            // Sort a private copy so binarySearch is valid without touching the caller's list.
            sortedAllowed = arrayList.toArray(new String[0]);
            Arrays.sort(sortedAllowed);
        } else {
            sortedAllowed = new String[0];
        }

        Matcher tag = STRIPPABLE_TAG.matcher(str);
        StringBuilder sb = new StringBuilder();
        int copiedUpTo = 0;
        while (tag.find()) {
            if (Arrays.binarySearch(sortedAllowed, tag.group(1)) < 0) {
                // Disallowed tag: keep only the text in front of it.
                String before = str.substring(copiedUpTo, tag.start());
                if (sb.lastIndexOf(">") < sb.length() - 1
                        && sb.lastIndexOf(" ") < sb.length() - 1
                        && sb.lastIndexOf("]]") < sb.length() - 2) {
                    sb.append(" ").append(before.trim());
                } else if (sb.lastIndexOf(" ") == sb.length() - 1) {
                    // Also taken when sb is still empty (-1 == -1).
                    sb.append(before.trim());
                } else {
                    sb.append(before);
                }
            } else {
                // Allowed tag: copy the preceding text and the tag itself.
                sb.append(str.substring(copiedUpTo, tag.end()));
            }
            copiedUpTo = tag.end();
        }

        if (copiedUpTo <= 0) {
            // No tag matched at all; hand back the input untouched.
            return str;
        }
        sb.append(str.substring(copiedUpTo));
        return sb.toString().trim();
    }

    /**
     * Looks up the placeholder marker for a line-break sequence.
     *
     * @param str one of {@code "\n"}, {@code "\r"} or {@code "\r\n"}
     * @return the matching marker, or {@code null} for any other key
     */
    public static String getRetSymbol(String str) {
        return retCodeMap.get(str);
    }
}
