package smile.nlp.tokenizer;

import java.util.ArrayList;
import java.util.regex.Pattern;

/* loaded from: input_file:smile/nlp/tokenizer/SimpleTokenizer.class */
/**
 * A rule-based word tokenizer for English text, driven by regular expressions.
 *
 * <p>Most punctuation is separated from adjacent words, while periods,
 * apostrophes, hyphens, slashes, commas and ampersands inside a token are
 * kept (so {@code 2,500} or {@code AT&T} stay in one piece). Optionally,
 * common English contractions are expanded or split before tokenization.
 *
 * <p>Character classes used for delimiter and whitespace detection are
 * Unicode-aware, so words containing non-ASCII letters (e.g. {@code café})
 * are not broken apart.
 */
public class SimpleTokenizer implements Tokenizer {
    /** "won't" -> "will not". */
    private static final Pattern WONT_CONTRACTION = Pattern.compile("(?i)\\b(w)(on't)\\b");
    /** "shan't" -> "shall not". */
    private static final Pattern SHANT_CONTRACTION = Pattern.compile("(?i)\\b(sha)(n't)\\b");
    /** "ain't" -> "am not". */
    private static final Pattern AINT_CONTRACTION = Pattern.compile("(?i)\\b(a)(in't)\\b");

    /** Negations whose second group is replaced by " not". */
    private static final Pattern[] NOT_CONTRACTIONS = {
        // "can't" / "cannot" -> "can not"
        Pattern.compile("(?i)\\b(can)('t|not)\\b"),
        // Generic "Xn't" -> "X not", e.g. "don't" -> "do not"
        Pattern.compile("(?i)(.)(n't)\\b")
    };

    /** Contractions that are split into two tokens. */
    private static final Pattern[] CONTRACTIONS2 = {
        Pattern.compile("(?i)(.)('ll|'re|'ve|'s|'m|'d)\\b"),
        Pattern.compile("(?i)\\b(D)('ye)\\b"),
        Pattern.compile("(?i)\\b(Gim)(me)\\b"),
        Pattern.compile("(?i)\\b(Gon)(na)\\b"),
        Pattern.compile("(?i)\\b(Got)(ta)\\b"),
        Pattern.compile("(?i)\\b(Lem)(me)\\b"),
        Pattern.compile("(?i)\\b(Mor)('n)\\b"),
        Pattern.compile("(?i)\\b(T)(is)\\b"),
        Pattern.compile("(?i)\\b(T)(was)\\b"),
        Pattern.compile("(?i)\\b(Wan)(na)\\b")
    };

    /** Contractions that are split into three tokens. */
    private static final Pattern[] CONTRACTIONS3 = {
        Pattern.compile("(?i)\\b(Whad)(dd)(ya)\\b"),
        Pattern.compile("(?i)\\b(Wha)(t)(cha)\\b")
    };

    /**
     * Delimiter rules, applied in order by {@link #split(String)}.
     * UNICODE_CHARACTER_CLASS makes {@code \w} and {@code \s} cover non-ASCII
     * letters and whitespace; without it every accented or non-Latin letter
     * would be treated as punctuation and split off as its own token.
     */
    private static final Pattern[] DELIMITERS = {
        // Any character that is not a word character or one of . ' - / , &
        Pattern.compile("([^\\w\\.\\'\\-\\/,&])", Pattern.UNICODE_CHARACTER_CLASS),
        // A comma followed by whitespace (a comma inside e.g. 2,500 is kept)
        Pattern.compile("(,\\s)", Pattern.UNICODE_CHARACTER_CLASS),
        // A single quote followed by whitespace
        Pattern.compile("('\\s)", Pattern.UNICODE_CHARACTER_CLASS),
        // A period (plus optional spaces) right before a newline or end of input
        Pattern.compile("\\. *(\\n|$)")
    };

    /** Token separator; Unicode-aware so that e.g. no-break spaces also separate tokens. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+", Pattern.UNICODE_CHARACTER_CLASS);

    /** Whether contractions are expanded/split before tokenizing. */
    private final boolean splitContraction;

    /**
     * Creates a tokenizer that leaves contractions untouched.
     */
    public SimpleTokenizer() {
        this(false);
    }

    /**
     * Creates a tokenizer.
     *
     * @param splitContraction if true, contractions are expanded or split
     *        (e.g. "won't" becomes "will not", "I'll" becomes "I 'll")
     *        before the text is tokenized.
     */
    public SimpleTokenizer(boolean splitContraction) {
        this.splitContraction = splitContraction;
    }

    /**
     * Splits the given text into word and punctuation tokens.
     *
     * <p>If the last token is a lone period and the token before it is
     * recognized by {@code EnglishAbbreviations.contains}, the period is
     * re-attached to that token; the standalone period token is still
     * returned after it.
     *
     * @param text the text to tokenize; must not be null.
     * @return the non-empty tokens in their order of appearance.
     */
    @Override // smile.nlp.tokenizer.Tokenizer
    public String[] split(String text) {
        if (splitContraction) {
            // Irregular negations first, so the generic n't rule below
            // does not turn "won't" into "wo not".
            text = WONT_CONTRACTION.matcher(text).replaceAll("$1ill not");
            text = SHANT_CONTRACTION.matcher(text).replaceAll("$1ll not");
            text = AINT_CONTRACTION.matcher(text).replaceAll("$1m not");

            for (Pattern regexp : NOT_CONTRACTIONS) {
                text = regexp.matcher(text).replaceAll("$1 not");
            }
            for (Pattern regexp : CONTRACTIONS2) {
                text = regexp.matcher(text).replaceAll("$1 $2");
            }
            for (Pattern regexp : CONTRACTIONS3) {
                text = regexp.matcher(text).replaceAll("$1 $2 $3");
            }
        }

        // Pad delimiters with spaces so that the whitespace split isolates them.
        text = DELIMITERS[0].matcher(text).replaceAll(" $1 ");
        text = DELIMITERS[1].matcher(text).replaceAll(" $1");
        text = DELIMITERS[2].matcher(text).replaceAll(" $1");
        text = DELIMITERS[3].matcher(text).replaceAll(" . ");

        String[] words = WHITESPACE.split(text);
        int n = words.length;
        // The trailing period may have been stripped from an abbreviation; put it back.
        if (n > 1 && words[n - 1].equals(".") && EnglishAbbreviations.contains(words[n - 2])) {
            words[n - 2] = words[n - 2] + ".";
        }

        // Leading whitespace in the padded text yields an empty first element; drop empties.
        ArrayList<String> tokens = new ArrayList<>(n);
        for (String word : words) {
            if (!word.isEmpty()) {
                tokens.add(word);
            }
        }
        return tokens.toArray(new String[0]);
    }
}
