/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.pipe;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureSequence;
import cc.mallet.types.Instance;
import java.io.Serializable;

/**
 * Pipe that splits character data into word tokens and keeps only the tokens
 * already present in this pipe's alphabet, replacing the instance data with a
 * {@link FeatureSequence} of their alphabet indices.
 *
 * <p>Tokenization is driven by the Unicode general category of each code point:
 * <ul>
 *   <li>letters of any case, combining marks and the underscore always extend
 *       (or start) the current token;</li>
 *   <li>dash punctuation and decimal digits extend a token that has already
 *       started, but never start one;</li>
 *   <li>space/line/paragraph separators and the remaining punctuation
 *       categories end the current token;</li>
 *   <li>every other category (for example {@code CONTROL}, which covers newline
 *       and tab) is skipped without ending the current token.
 *       NOTE(review): this means text separated only by a newline is glued into
 *       one token; kept as-is for compatibility, confirm whether intended.</li>
 * </ul>
 *
 * <p>The scratch buffers are instance fields that {@link #pipe(Instance)} mutates
 * without synchronization, so a single instance must not be used by several
 * threads at once.
 */
public class FixedVocabTokenizer
extends Pipe
implements Serializable {
    /**
     * Length threshold applied to candidate tokens. The comparison is made against
     * the index of the token's last code point, so a token is kept only when it has
     * at least {@code minimumLength + 1} code points (4 with the default of 3).
     * NOTE(review): the name suggests an inclusive minimum; behavior is preserved
     * here, confirm before changing.
     */
    public int minimumLength = 3;
    /** Scratch space for the alphabet indices of the tokens found in one instance. */
    int[] tokenBuffer = new int[100000];
    /** Scratch space for the code points of the token being built; grown on demand. */
    int[] characterBuffer = new int[1000];
    static final long serialVersionUID = 1L;

    /**
     * Creates a tokenizer over a fixed vocabulary.
     *
     * @param alphabet vocabulary handed to the {@link Pipe} constructor as its first
     *                 argument (the second is {@code null}); presumably the alphabet
     *                 later returned by {@code getAlphabet()} -- confirm against Pipe
     */
    public FixedVocabTokenizer(Alphabet alphabet) {
        super(alphabet, null);
    }

    /**
     * Tokenizes the instance's character data and replaces it with a
     * {@link FeatureSequence} holding the alphabet indices of the accepted tokens.
     *
     * <p>If the token buffer fills up, a message is written to {@code System.err}
     * and the rest of the text is ignored.
     *
     * @param instance instance whose data is a {@link CharSequence}
     * @return the same instance, with its data replaced
     * @throws IllegalArgumentException if the instance data is {@code null} or not a
     *                                  {@link CharSequence}
     */
    @Override
    public Instance pipe(Instance instance) {
        Object data = instance.getData();
        if (!(data instanceof CharSequence)) {
            // Null-safe: a null payload used to fail with a NullPointerException while
            // building this very message.
            String found = data == null ? "null" : data.getClass().toString();
            throw new IllegalArgumentException("Looking for a CharSequence, found a " + found);
        }
        CharSequence characters = (CharSequence) data;
        Alphabet alphabet = this.getAlphabet();

        int numTokens = 0;
        // Index of the last code point of the token in progress; -1 means "no token".
        int length = -1;
        int numChars = characters.length();
        int i = 0;
        while (i < numChars) {
            if (numTokens == this.tokenBuffer.length - 1) {
                System.err.println("Overflowed token buffer");
                break;
            }
            int codePoint = Character.codePointAt(characters, i);
            // Advance by UTF-16 units, not by one: codePointAt takes a char index, and a
            // supplementary code point occupies two chars.
            i += Character.charCount(codePoint);

            int type = Character.getType(codePoint);
            // The underscore is CONNECTOR_PUNCTUATION, so it must be tested before the
            // delimiter categories to keep it inside tokens.
            if (codePoint == '_' || isLetterOrMark(type)) {
                length = this.appendCodePoint(length, codePoint);
            } else if (type == Character.DASH_PUNCTUATION || type == Character.DECIMAL_DIGIT_NUMBER) {
                // Dashes and digits may continue a token but never begin one.
                if (length != -1) {
                    length = this.appendCodePoint(length, codePoint);
                }
            } else if (isDelimiter(type)) {
                if (length != -1) {
                    numTokens = this.emitToken(alphabet, length, numTokens);
                    length = -1;
                }
            }
            // Any other category is ignored and does not end the current token.
        }
        // Flush the token still in progress when the text (or the scan) ended.
        if (length != -1) {
            numTokens = this.emitToken(alphabet, length, numTokens);
        }

        int[] tokens = new int[numTokens];
        System.arraycopy(this.tokenBuffer, 0, tokens, 0, numTokens);
        instance.setData(new FeatureSequence(alphabet, tokens));
        return instance;
    }

    /** Returns true for the letter and combining-mark categories that always extend a token. */
    private static boolean isLetterOrMark(int type) {
        switch (type) {
            case Character.UPPERCASE_LETTER:
            case Character.LOWERCASE_LETTER:
            case Character.TITLECASE_LETTER:
            case Character.MODIFIER_LETTER:
            case Character.OTHER_LETTER:
            case Character.NON_SPACING_MARK:
            case Character.ENCLOSING_MARK:
            case Character.COMBINING_SPACING_MARK:
                return true;
            default:
                return false;
        }
    }

    /** Returns true for the separator and punctuation categories that end a token. */
    private static boolean isDelimiter(int type) {
        switch (type) {
            case Character.SPACE_SEPARATOR:
            case Character.LINE_SEPARATOR:
            case Character.PARAGRAPH_SEPARATOR:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.CONNECTOR_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Stores a code point after position {@code length} in the character buffer,
     * enlarging the buffer first when it is full, and returns the new last index.
     */
    private int appendCodePoint(int length, int codePoint) {
        int next = length + 1;
        if (next >= this.characterBuffer.length) {
            // Grow instead of failing with ArrayIndexOutOfBoundsException on very long runs.
            int[] grown = new int[Math.max(16, this.characterBuffer.length * 2)];
            System.arraycopy(this.characterBuffer, 0, grown, 0, next);
            this.characterBuffer = grown;
        }
        this.characterBuffer[next] = codePoint;
        return next;
    }

    /**
     * Records the buffered token (code points {@code 0..length}) in the token buffer
     * when the alphabet contains it and it passes the length threshold, and returns
     * the updated token count.
     */
    private int emitToken(Alphabet alphabet, int length, int numTokens) {
        String token = new String(this.characterBuffer, 0, length + 1);
        if (alphabet.contains(token) && length >= this.minimumLength) {
            this.tokenBuffer[numTokens] = alphabet.lookupIndex(token);
            return numTokens + 1;
        }
        return numTokens;
    }
}

