package org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers;

import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.TokenizerCategory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo;
import org.apache.hyracks.util.string.UTF8StringUtil;

/* loaded from: input_file:org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.class */
/**
 * Word tokenizer over a UTF-8 encoded byte range: a token is a maximal run of
 * characters for which {@link #isSeparator(char)} is {@code false}.
 *
 * <p>The cursor and buffers used here ({@code byteIndex}, {@code sentenceBytes},
 * {@code sentenceEndOffset}, {@code token}, {@code tokensStart}, {@code tokensLength},
 * {@code ignoreTokenCount}) are not declared in this class; they are presumably
 * inherited from {@link AbstractUTF8StringBinaryTokenizer} and initialised by its
 * {@code reset} -- confirm against the superclass.
 */
public class DelimitedUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
    /**
     * Until {@link #getTokensCount()} has been called after the last reset, the number
     * of tokens emitted by {@link #next()}; afterwards, the total number of tokens in
     * the current input.
     */
    protected short tokenCount;

    /** True once {@link #getTokensCount()} has stored the total in {@link #tokenCount}. */
    private boolean tokenCountCalculated;

    /** Value of {@code byteIndex} right after the last reset; where counting starts. */
    private int originalIndex;

    /**
     * Forwards all three arguments, unchanged and in order, to the superclass constructor.
     * NOTE(review): the meaning of the two flags is defined by the superclass and is not
     * visible here.
     */
    public DelimitedUTF8StringBinaryTokenizer(boolean z, boolean z2, ITokenFactory iTokenFactory) {
        super(z, z2, iTokenFactory);
    }

    /**
     * Resets the superclass state with the given arguments, clears the token count and
     * its cache flag, and remembers the resulting {@code byteIndex} as the scan origin.
     */
    @Override
    public void reset(byte[] bArr, int i, int i2) {
        super.reset(bArr, i, i2);
        tokenCount = 0;
        tokenCountCalculated = false;
        originalIndex = byteIndex;
    }

    /**
     * Skips any separator characters at the cursor.
     *
     * @return {@code true} if the cursor then sits before the end offset, i.e. on the
     *         first character of another token
     */
    @Override
    public boolean hasNext() {
        while (byteIndex < sentenceEndOffset && isSeparator(UTF8StringUtil.charAt(sentenceBytes, byteIndex))) {
            byteIndex += UTF8StringUtil.charSize(sentenceBytes, byteIndex);
        }
        return byteIndex < sentenceEndOffset;
    }

    /**
     * Tells whether a character delimits tokens.
     *
     * @param c the character to classify
     * @return {@code false} if {@code c} is a letter or digit, or has general category
     *         {@link Character#OTHER_LETTER} or {@link Character#OTHER_NUMBER};
     *         {@code true} otherwise
     */
    public static boolean isSeparator(char c) {
        if (Character.isLetterOrDigit(c)) {
            return false;
        }
        int category = Character.getType(c);
        return category != Character.OTHER_LETTER && category != Character.OTHER_NUMBER;
    }

    /**
     * Consumes the run of non-separator characters at the cursor and resets
     * {@code token} to describe it.
     *
     * <p>Unless {@code ignoreTokenCount} is set, the token is compared case-insensitively
     * with every token previously recorded in {@code tokensStart}/{@code tokensLength};
     * the last argument passed to {@code token.reset} is one plus the number of matches.
     * The new token is then recorded as well.
     */
    @Override
    public void next() {
        final int tokenStart = byteIndex;
        int tokenLength = 0; // length in characters, not bytes
        while (byteIndex < sentenceEndOffset && !isSeparator(UTF8StringUtil.charAt(sentenceBytes, byteIndex))) {
            byteIndex += UTF8StringUtil.charSize(sentenceBytes, byteIndex);
            tokenLength++;
        }

        int occurrences = 1;
        if (tokenLength > 0 && !ignoreTokenCount) {
            for (int seen = 0; seen < tokensStart.length(); seen++) {
                if (tokenLength == tokensLength.get(seen)
                        && tokensMatchIgnoreCase(tokenStart, tokensStart.get(seen), tokenLength)) {
                    occurrences++;
                }
            }
            tokensStart.add(tokenStart);
            tokensLength.add(tokenLength);
        }

        token.reset(sentenceBytes, tokenStart, byteIndex, tokenLength, occurrences);
        // Once getTokensCount() has stored the total, tokenCount must no longer move.
        if (!tokenCountCalculated) {
            tokenCount++;
        }
    }

    /**
     * Compares two tokens of equal character length, ignoring case.
     *
     * <p>Each token is walked with its own byte offset: characters that are equal after
     * lower-casing may still have different UTF-8 widths, so a shared offset could land
     * in the middle of a character.
     */
    private boolean tokensMatchIgnoreCase(int currentStart, int earlierStart, int charCount) {
        int currentPos = currentStart;
        int earlierPos = earlierStart;
        for (int n = 0; n < charCount; n++) {
            char currentChar = UTF8StringUtil.charAt(sentenceBytes, currentPos);
            char earlierChar = UTF8StringUtil.charAt(sentenceBytes, earlierPos);
            if (Character.toLowerCase(currentChar) != Character.toLowerCase(earlierChar)) {
                return false;
            }
            currentPos += UTF8StringUtil.charSize(sentenceBytes, currentPos);
            earlierPos += UTF8StringUtil.charSize(sentenceBytes, earlierPos);
        }
        return true;
    }

    /**
     * Counts the tokens between the position recorded at the last reset and the end
     * offset. The scan runs once per reset; later calls return the stored value.
     *
     * @return the total number of tokens in the current input
     */
    @Override
    public short getTokensCount() {
        if (!tokenCountCalculated) {
            short total = 0;
            boolean previousWasSeparator = true;
            // Local cursor: originalIndex must stay put so the origin is not lost.
            int cursor = originalIndex;
            while (cursor < sentenceEndOffset) {
                if (isSeparator(UTF8StringUtil.charAt(sentenceBytes, cursor))) {
                    previousWasSeparator = true;
                } else if (previousWasSeparator) {
                    // First character of a new token.
                    total++;
                    previousWasSeparator = false;
                }
                cursor += UTF8StringUtil.charSize(sentenceBytes, cursor);
            }
            tokenCount = total;
            tokenCountCalculated = true;
        }
        return tokenCount;
    }

    /** @return {@link TokenizerInfo.TokenizerType#STRING} */
    @Override
    public TokenizerInfo.TokenizerType getTokenizerType() {
        return TokenizerInfo.TokenizerType.STRING;
    }

    /** @return {@link TokenizerCategory#WORD} */
    @Override
    public TokenizerCategory getTokenizerCategory() {
        return TokenizerCategory.WORD;
    }
}
