/*
 * Decompiled with CFR 0.152.
 */
package de.jungblut.nlp;

import com.google.common.base.Preconditions;
import de.jungblut.datastructure.ArrayUtils;
import de.jungblut.datastructure.StringPool;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/**
 * Static helpers for splitting text into tokens and for post-processing token
 * arrays: normalization, n-gram construction, de-duplication, interning and
 * start/end tagging.
 *
 * <p>The class holds no mutable state besides a lookup table that is filled
 * once during class initialization and only read afterwards.
 */
public final class TokenizerUtils {
    /** Marker placed at the end of the array by {@link #addStartAndEndTags(String[])}. */
    public static final String END_TAG = "<END>";
    /** Marker placed at the start of the array by {@link #addStartAndEndTags(String[])}. */
    public static final String START_TAG = "<START>";
    /**
     * Delimiter characters handed to {@link StringTokenizer} by
     * {@link #wordTokenize(String, boolean)} when separators are kept.
     *
     * <p>Note: read as a plain character set this string contains a literal
     * backslash, so the separator-keeping tokenizer splits on backslashes. The
     * regular expression used when separators are dropped treats that backslash
     * as an escape for the hyphen and therefore does not split on backslashes.
     */
    public static final String SEPARATORS = " \r\n\t.,;:'\"()?!\\-/|\u201c\u201e";

    /** Character class used by {@link #wordTokenize(String, boolean)} when separators are dropped. */
    private static final Pattern SEPARATORS_PATTERN = Pattern.compile("[ \r\n\t\\.,;:'\"()?!\\-/|\u201c\u201e]");
    /** One or more whitespace characters. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
    /** A single ASCII digit. */
    private static final Pattern NUMERIC_PATTERN = Pattern.compile("[0-9]");
    /** The non-breaking space character, which {@link String#trim()} does not strip. */
    private static final CharSequence NON_BREAKING_WHITESPACE = "\u00a0";
    /**
     * Lookup table indexed by character code (0-255). A zero entry means the
     * character is dropped by {@link #normalizeString(String)}; any other entry
     * is the character it is replaced with.
     */
    private static final char[] CHARACTER_REPLACE_MAPPING = TokenizerUtils.buildCharacterReplaceMapping();

    /** Not instantiable; this class only offers static helpers. */
    private TokenizerUtils() {
        throw new IllegalAccessError();
    }

    /**
     * Builds the table behind {@link #normalizeString(String)}: ASCII letters map
     * to their lower-case form, digits and the space map to themselves, and the
     * German umlauts map to their lower-case form (sharp s maps to itself).
     * Every other code stays zero.
     */
    private static char[] buildCharacterReplaceMapping() {
        char[] mapping = new char[256];
        int lowerDifference = 'a' - 'A';
        for (char c = 'A'; c <= 'Z'; ++c) {
            mapping[c] = (char) (c + lowerDifference);
        }
        for (char c = 'a'; c <= 'z'; ++c) {
            mapping[c] = c;
        }
        for (char c = '0'; c <= '9'; ++c) {
            mapping[c] = c;
        }
        mapping[' '] = ' ';
        // Lower-case a-, o- and u-umlaut are kept as they are.
        mapping['\u00e4'] = '\u00e4';
        mapping['\u00f6'] = '\u00f6';
        mapping['\u00fc'] = '\u00fc';
        // Upper-case umlauts are folded to their lower-case counterparts.
        mapping['\u00c4'] = '\u00e4';
        mapping['\u00d6'] = '\u00f6';
        mapping['\u00dc'] = '\u00fc';
        // Sharp s is kept as it is.
        mapping['\u00df'] = '\u00df';
        return mapping;
    }

    /**
     * Applies a regex replacement to every token and returns the results in a new
     * array; the input array is not modified.
     *
     * @param regex the regular expression to search for in each token
     * @param replacement the replacement, with the semantics of
     *        {@link String#replaceAll(String, String)}
     * @param tokens the tokens to process
     * @param removeEmpty if {@code true}, tokens that are empty after the
     *        replacement are dropped from the result
     * @return a new array with the replaced tokens
     */
    public static String[] removeMatchingRegex(String regex, String replacement, String[] tokens, boolean removeEmpty) {
        String[] replaced = new String[tokens.length];
        if (tokens.length > 0) {
            // Compile once instead of once per token; skipped for empty input so
            // that the regex is still never touched in that case.
            Pattern pattern = Pattern.compile(regex);
            for (int i = 0; i < tokens.length; ++i) {
                replaced[i] = pattern.matcher(tokens[i]).replaceAll(replacement);
            }
        }
        return removeEmpty ? TokenizerUtils.removeEmpty(replaced) : replaced;
    }

    /**
     * Alias for {@link #nShinglesTokenize(String, int)}.
     *
     * @param key the string to split
     * @param size the length of each q-gram
     * @return the q-grams of {@code key}
     */
    public static String[] qGramTokenize(String key, int size) {
        return TokenizerUtils.nShinglesTokenize(key, size);
    }

    /**
     * Splits a string into all of its overlapping substrings of the given length,
     * in order of their start position.
     *
     * @param key the string to split
     * @param size the length of each shingle; expected to be positive
     * @return the shingles, or a single-element array holding {@code key} itself
     *         when {@code key} is shorter than {@code size}
     */
    public static String[] nShinglesTokenize(String key, int size) {
        if (key.length() < size) {
            return new String[]{key};
        }
        int shingleCount = key.length() - size + 1;
        String[] shingles = new String[shingleCount];
        for (int start = 0; start < shingleCount; ++start) {
            shingles[start] = key.substring(start, start + size);
        }
        return shingles;
    }

    /**
     * Splits the text on runs of whitespace.
     *
     * @param text the text to split
     * @return the pieces between whitespace runs, as produced by
     *         {@link Pattern#split(CharSequence)}
     */
    public static String[] whiteSpaceTokenize(String text) {
        return WHITESPACE_PATTERN.split(text);
    }

    /**
     * Removes duplicate tokens while keeping the order of first occurrence.
     *
     * @param tokens the tokens to de-duplicate
     * @return a new array holding each distinct token once
     */
    public static String[] deduplicateTokens(String[] tokens) {
        LinkedHashSet<String> unique = new LinkedHashSet<String>();
        Collections.addAll(unique, tokens);
        return unique.toArray(new String[unique.size()]);
    }

    /**
     * Splits the text on the built-in separator characters, dropping the
     * separators. Equivalent to {@code wordTokenize(text, false)}.
     *
     * @param text the text to split
     * @return the tokens
     */
    public static String[] wordTokenize(String text) {
        return TokenizerUtils.wordTokenize(text, false);
    }

    /**
     * Splits the text on the built-in separator characters.
     *
     * @param text the text to split
     * @param keepSeperators if {@code true}, each separator character is returned
     *        as its own token, except for separators with a character code at or
     *        below the space character (whitespace), which are dropped; if
     *        {@code false}, the text is split with the separator regex and the
     *        separators are discarded
     * @return the tokens
     */
    public static String[] wordTokenize(String text, boolean keepSeperators) {
        if (!keepSeperators) {
            return SEPARATORS_PATTERN.split(text);
        }
        StringTokenizer tokenizer = new StringTokenizer(text, SEPARATORS, true);
        // Upper bound: whitespace delimiters are filtered out below.
        String[] kept = new String[tokenizer.countTokens()];
        int count = 0;
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (token.charAt(0) > ' ') {
                kept[count++] = token;
            }
        }
        return Arrays.copyOf(kept, count);
    }

    /**
     * Splits the text with a caller-supplied regular expression.
     *
     * @param text the text to split
     * @param regex the delimiter expression, as for {@link String#split(String)}
     * @return the tokens
     */
    public static String[] wordTokenize(String text, String regex) {
        return text.split(regex);
    }

    /**
     * Normalizes every token with {@link #normalizeString(String)}.
     *
     * <p>The given array is modified in place. When {@code removeEmpty} is set a
     * new, filtered array is returned instead of the input array.
     *
     * @param tokens the tokens to normalize; overwritten with the normalized values
     * @param removeEmpty if {@code true}, tokens that are empty after
     *        normalization are dropped from the result
     * @return the normalized tokens
     */
    public static String[] normalizeTokens(String[] tokens, boolean removeEmpty) {
        for (int i = 0; i < tokens.length; ++i) {
            tokens[i] = TokenizerUtils.normalizeString(tokens[i]);
        }
        return removeEmpty ? TokenizerUtils.removeEmpty(tokens) : tokens;
    }

    /**
     * Lower-cases a token and strips every character that is not an ASCII letter,
     * an ASCII digit, a space, a German umlaut or a sharp s.
     *
     * @param token the token to normalize
     * @return the normalized token, possibly empty
     */
    public static String normalizeString(String token) {
        char[] kept = new char[token.length()];
        int length = 0;
        for (int i = 0; i < token.length(); ++i) {
            char c = token.charAt(i);
            // Characters outside the table, or mapped to zero, are dropped.
            if (c < CHARACTER_REPLACE_MAPPING.length && CHARACTER_REPLACE_MAPPING[c] != 0) {
                kept[length++] = CHARACTER_REPLACE_MAPPING[c];
            }
        }
        return new String(kept, 0, length);
    }

    /**
     * Filters out {@code null} and empty strings.
     *
     * @param arr the array to filter; not modified
     * @return a new array with the remaining strings in their original order
     */
    public static String[] removeEmpty(String[] arr) {
        ArrayList<String> nonEmpty = new ArrayList<String>();
        for (String s : arr) {
            if (s != null && !s.isEmpty()) {
                nonEmpty.add(s);
            }
        }
        return nonEmpty.toArray(new String[nonEmpty.size()]);
    }

    /**
     * Splits the text on whitespace and builds word n-grams from the result.
     *
     * @param text the text to split
     * @param size the number of words per n-gram
     * @return the n-grams, see {@link #buildNGrams(String[], int)}
     */
    public static String[] whiteSpaceTokenizeNGrams(String text, int size) {
        return TokenizerUtils.buildNGrams(TokenizerUtils.whiteSpaceTokenize(text), size);
    }

    /**
     * Builds n-grams by joining every run of {@code size} consecutive tokens with
     * a single space.
     *
     * @param tokens the tokens to combine
     * @param size the number of tokens per n-gram; expected to be positive
     * @return a new array of n-grams, or the given {@code tokens} array itself
     *         (not a copy) when it holds fewer than {@code size} tokens
     */
    public static String[] buildNGrams(String[] tokens, int size) {
        if (tokens.length < size) {
            return tokens;
        }
        int gramCount = tokens.length - size + 1;
        ArrayList<String> grams = new ArrayList<String>(gramCount);
        for (int start = 0; start < gramCount; ++start) {
            StringBuilder gram = new StringBuilder(tokens[start]);
            int end = start + size;
            for (int j = start + 1; j < end; ++j) {
                gram.append(' ').append(tokens[j]);
            }
            grams.add(gram.toString());
        }
        return grams.toArray(new String[grams.size()]);
    }

    /**
     * Builds the n-grams for every size from {@code startSize} to {@code endSize}
     * (both inclusive) and concatenates them in order of increasing size.
     *
     * <p>For a size larger than the number of tokens,
     * {@link #buildNGrams(String[], int)} yields the plain tokens, so they are
     * appended once for each such size.
     *
     * @param tokens the tokens to combine
     * @param startSize the smallest n-gram size
     * @param endSize the largest n-gram size
     * @return the concatenated n-grams
     */
    public static String[] buildNGramsRange(String[] tokens, int startSize, int endSize) {
        String[] grams = TokenizerUtils.buildNGrams(tokens, startSize);
        for (int size = startSize + 1; size <= endSize; ++size) {
            grams = ArrayUtils.concat(grams, TokenizerUtils.buildNGrams(tokens, size));
        }
        return grams;
    }

    /**
     * Replaces every element with its {@link String#intern() interned} instance.
     *
     * @param strings the strings to intern; modified in place
     * @return the same array
     */
    public static String[] internStrings(String[] strings) {
        for (int i = 0; i < strings.length; ++i) {
            strings[i] = strings[i].intern();
        }
        return strings;
    }

    /**
     * Replaces every element with the value the given pool returns for it.
     *
     * @param strings the strings to pool; modified in place
     * @param pool the pool to use, must not be {@code null}
     * @return the same array
     * @throws NullPointerException if {@code pool} is {@code null}
     */
    public static String[] internStrings(String[] strings, StringPool pool) {
        Preconditions.checkNotNull(pool, "Pool shouldn't be null!");
        for (int i = 0; i < strings.length; ++i) {
            strings[i] = pool.pool(strings[i]);
        }
        return strings;
    }

    /**
     * Surrounds the tokens with {@link #START_TAG} and {@link #END_TAG}.
     *
     * @param unigram the tokens to wrap; not modified
     * @return a new array that is two elements longer than the input
     */
    public static String[] addStartAndEndTags(String[] unigram) {
        String[] tagged = new String[unigram.length + 2];
        tagged[0] = START_TAG;
        System.arraycopy(unigram, 0, tagged, 1, unigram.length);
        tagged[tagged.length - 1] = END_TAG;
        return tagged;
    }

    /**
     * Joins the tokens into one string, placing the delimiter between
     * neighbouring tokens (not after the last one).
     *
     * @param tokens the tokens to join
     * @param delimiter the text to insert between tokens
     * @return the joined string, empty for an empty array
     */
    public static String concat(String[] tokens, String delimiter) {
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < tokens.length; ++i) {
            if (i > 0) {
                joined.append(delimiter);
            }
            joined.append(tokens[i]);
        }
        return joined.toString();
    }

    /**
     * Replaces every ASCII digit in every token with {@code '#'}.
     *
     * @param tokens the tokens to process; not modified
     * @return a new array with the masked tokens
     */
    public static String[] numericsToHash(String[] tokens) {
        String[] masked = new String[tokens.length];
        for (int i = 0; i < tokens.length; ++i) {
            masked[i] = NUMERIC_PATTERN.matcher(tokens[i]).replaceAll("#");
        }
        return masked;
    }

    /**
     * Trims every token and additionally removes all non-breaking space
     * characters from it, wherever they occur in the token.
     *
     * @param tokens the tokens to trim; not modified
     * @return a new array with the trimmed tokens
     */
    public static String[] trim(String[] tokens) {
        String[] trimmed = new String[tokens.length];
        for (int i = 0; i < tokens.length; ++i) {
            trimmed[i] = tokens[i].trim().replace(NON_BREAKING_WHITESPACE, "");
        }
        return trimmed;
    }
}

